re.sub

Here are examples of the Python API re.sub taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

162 Examples
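As a quick refresher before the examples: re.sub(pattern, repl, string) returns a copy of string with every non-overlapping match of pattern replaced by repl, where repl may also be a callable that receives the match object. A minimal sketch with made-up input strings:

import re

# Collapse runs of whitespace into a single space.
print(re.sub(r'\s+', ' ', 'too   many    spaces'))   # 'too many spaces'

# repl can be a function of the match object.
print(re.sub(r'\d+', lambda m: str(int(m.group(0)) * 2), '3 cats, 12 dogs'))
# '6 cats, 24 dogs'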

Example 51

Project: Kaggle_HomeDepot Source File: homedepot_functions.py
def str_parser(s, automatic_spell_check_dict={}, remove_from_brackets=False,parse_material=False,add_space_stop_list=[]):
    #the following three replacements are shared on the forum    
    s = s.replace("craftsm,an","craftsman")        
    s = re.sub(r'depot.com/search=', '', s)
    s = re.sub(r'pilers,needlenose', 'pliers, needle nose', s)
    
    s = re.sub(r'\bmr.', 'mr ', s)
    # decode/strip html entities left in the product text
    s = re.sub(r'&amp;', '&', s)
    s = re.sub('&nbsp;', '', s)
    s = re.sub('&#39;', '', s)
    s = re.sub(r'(?<=[0-9]),[\ ]*(?=[0-9])', '', s)
    s = s.replace(";",".")
    s = s.replace(",",".")
    s = s.replace(":",". ")
    s = s.replace("+"," ")
    s = re.sub(r'\bU.S.', 'US ', s)
    s = s.replace(" W x "," ")
    s = s.replace(" H x "," ")
    s = re.sub(' [\#]\d+[\-\d]*[\,]*', '', s)    
    s = re.sub('(?<=[0-9\%])(?=[A-Z][a-z])', '. ', s) # add dot between number and cap letter
    s = re.sub(r'(?<=\))(?=[a-zA-Z0-9])', ' ', s) # add space between parentheses and letters
    s = re.sub(r'(?<=[a-zA-Z0-9])(?=\()', ' ', s) # add space between parentheses and letters

    if parse_material:
        replace_dict={'Medium Density Fiberboard (MDF)':'mdf', 'High Density Fiberboard (HDF)':'hdf',\
        'Fibre Reinforced Polymer (FRP)': 'frp', 'Acrylonitrile Butadiene Styrene (ABS)': 'abs',\
        'Cross-Linked Polyethylene (PEX)':'pex', 'Chlorinated Poly Vinyl Chloride (CPVC)': 'cpvc',\
        'PVC (vinyl)': 'pvc','Thermoplastic rubber (TPR)':'tpr','Poly Lactic Acid (PLA)': 'pla',\
        '100% Polyester':'polyester','100% UV Olefin':'olefin', '100% BCF Polypropylene': 'polypropylene',\
        '100% PVC':'pvc'}
        
        if s in replace_dict.keys():
            s=replace_dict[s]


    s = re.sub('[^a-zA-Z0-9\n\ \%\$\-\#\@\&\/\.\'\*\(\)]', ' ', s)
    s= " ".join(s.split())

    s=s.replace("-"," ")
    
    if len(add_space_stop_list)>0:
        s = " ".join([re.sub('(?<=[a-z])(?=[A-Z][a-z\ ])', '. ', word)  if word.lower() not in add_space_stop_list else word for word in s.split()])

    s=s.lower() 
    s = re.sub('\.(?=[a-z])', '. ', s) #dots before words -> replace with spaces
   # s = re.sub('(?<=[a-z])(?=[A-Z][a-z\ ])', ' ', s) # add space if uppercase after lowercase
    s = re.sub('(?<=[a-z][a-z][a-z])(?=[0-9])', ' ', s) # add space if number after at least three letters
    ##s = re.sub('(?<=[a-zA-Z])\.(?=\ |$)', '', s) #remove dots at the end of string
    #s = re.sub('(?<=[0-9])\.(?=\ |$)', '', s) # dot after digit before space
    s = re.sub('^\.\ ', '', s) #dot at the beginning before space
    

    if len(automatic_spell_check_dict.keys())>0:
        s=spell_correction(s,automatic_spell_check_dict=automatic_spell_check_dict)
    
    if remove_from_brackets==True:
        s = re.sub('(?<=\()[a-zA-Z0-9\n\ \%\$\-\#\@\&\/\.\'\*\(\)]*(?=\))', '', s)
    else:
        s=s.replace(" (",". ")
        s=re.sub('(?<=[a-zA-Z0-9\%\$])\(', '. ', s)
        s=s.replace(" )",". ")
        s=s.replace(")",". ")
        s=s.replace("  "," ")
        s = re.sub('\ \.', '\.', s)
        

    #######s = re.sub('(?<=[0-9\%])(?=[a-wyz])', ' ', s) # add space between number and text (except letter x) 
    #s = re.sub('(?<=[a-zA-Z])-(?=[a-zA-Z])', ' ', s) # replace '-' in words with space
    s=s.replace("at&t","att")
    s=s.replace("&"," and ")    
    s=s.replace("*"," x ")
    s = re.sub('(?<=[a-z\ ])\/(?=[a-z\ ])', ' ', s) # replace "/" between words with space
    s = re.sub('(?<=[a-z])\\\\(?=[a-z])', ' ', s) # replace "\" between words with space
    s=s.replace("  "," ")
    s=s.replace("  "," ")
    
    #s=re.sub('(?<=\ [a-ux-z])\ (?=[0-9])', '', s)   #remove spaces
    #s=re.sub('(?<=^[a-z])\ (?=[0-9])', '', s)   #remove spaces




    #####################################
    ### thesaurus replacement in all vars
    s=replace_in_parser(s)
    
    s = re.sub('half(?=\ inch)', '1/2', s)
    s = re.sub('\ba half\b', '1/2', s)
    #s = re.sub('half\ ', 'half-', s)

    s = re.sub(r'(?<=\')s\b', '', s)
    s = re.sub('(?<=[0-9])\'\'', ' in ', s)
    s = re.sub('(?<=[0-9])\'', ' in ', s)

    s = re.sub(r'(?<=[0-9])[\ ]*inch[es]*\b', '-in ', s)
    s = re.sub(r'(?<=[0-9])[\ ]*in\b', '-in ', s)
    
    s = re.sub(r'(?<=[0-9])[\-|\ ]*feet[s]*\b', '-ft ', s)
    s = re.sub(r'(?<=[0-9])[\ ]*foot[s]*\b', '-ft ', s)
    s = re.sub(r'(?<=[0-9])[\ ]*ft[x]*\b', '-ft ', s)
    
    s = re.sub('(?<=[0-9])[\ ]*volt[s]*(?=\ |$|\.)', '-V ', s)
    s = re.sub('(?<=[0-9])[\ ]*v(?=\ |$|\.)', '-V ', s)
    
    s = re.sub('(?<=[0-9])[\ ]*wat[t]*[s]*(?=\ |$|\.)', '-W ', s)
    s = re.sub('(?<=[0-9])[\ ]*w(?=\ |$|\.)', '-W ', s)
    
    s = re.sub('(?<=[0-9])[\ ]*kilo[\ ]*watt[s]*(?=\ |$|\.)', '-KW ', s)
    s = re.sub('(?<=[0-9])[\ ]*kw(?=\ |$|\.)', '-KW ', s)
    
    s = re.sub('(?<=[0-9])[\ ]*amp[s]*(?=\ |$|\.)', '-A ', s)
    #s = re.sub('(?<=[0-9]) a(?=\ |$|\.)', '-A. ', s)
    s = re.sub('(?<=[0-9])a(?=\ |$|\.)', '-A ', s)

    s = re.sub('(?<=[0-9])[\ ]*gallon[s]*(?=\ |$|\.)', '-gal ', s)
    s = re.sub('(?<=[0-9])[\ ]*gal(?=\ |$|\.)', '-gal ', s)
        
    s = re.sub('(?<=[0-9])[\ ]*pound[s]*(?=\ |$|\.)', '-lb ', s)
    s = re.sub('(?<=[0-9])[\ ]*lb[s]*(?=\ |$|\.)', '-lb ', s)
        
    s = re.sub('(?<=[0-9])[\ ]*mi[l]+imet[er]*[s]*(?=\ |$|\.)', '-mm ', s)
    s = re.sub('(?<=[0-9])[\ ]*mm(?=\ |$|\.)', '-mm ', s)
        
    s = re.sub('(?<=[0-9])[\ ]*centimeter[s]*(?=\ |$|\.)', '-cm ', s)
    s = re.sub('(?<=[0-9])[\ ]*cm(?=\ |$|\.)', '-cm ', s)
        
    s = re.sub('(?<=[0-9])[\ ]*ounce[s]*(?=\ |$|\.)', '-oz ', s)
    s = re.sub('(?<=[0-9])[\ ]*oz(?=\ |$|\.)', '-oz ', s)
    
    s = re.sub('(?<=[0-9])[\ ]*liter[s]*(?=\ |$|\.)', '-L ', s)
    s = re.sub('(?<=[0-9])[\ ]*litre[s]*(?=\ |$|\.)', '-L ', s)
    s = re.sub('(?<=[0-9])[\ ]*l(?=\ |$|\.)', '-L. ', s)
    
    s = re.sub('(?<=[0-9])[\ ]*square feet[s]*(?=\ |$|\.)', '-sqft ', s)
    s = re.sub('(?<=[0-9])square feet[s]*(?=\ |$|\.)', '-sqft ', s)
    s = re.sub('(?<=[0-9])[\ ]*sq[\ |\.|\.\ ]*ft(?=\ |$|\.)', '-sqft ', s)
    s = re.sub('(?<=[0-9])[\ ]*sq. ft(?=\ |$|\.)', '-sqft', s)
    s = re.sub('(?<=[0-9])[\ ]*sq.ft(?=\ |$|\.)', '-sqft', s)
    
    s = re.sub('(?<=[0-9])[\ ]*cubic f[e]*t[s]*(?=\ |$|\.)', '-cuft ', s)
    s = re.sub('(?<=[0-9])[\ ]*cu[\ |\.|\.\ ]*ft(?=\ |$|\.)', '-cuft ', s)
    s = re.sub('(?<=[0-9])[\ ]*cu[\.]*[\ ]*ft(?=\ |$|\.)', '-cuft', s)
    
     
    #remove 'x'
    s = re.sub('(?<=[0-9]) x (?=[0-9])', '-X ', s)
    s = re.sub('(?<=[0-9])x (?=[0-9])', '-X ', s)
    s = re.sub('(?<=[0-9]) x(?=[0-9])', '-X ', s)
    s = re.sub('(?<=[0-9])x(?=[0-9])', '-X ', s)
    
    #s=s.replace("..",".")
    s=s.replace("\n"," ")
    s=s.replace("  "," ")

    words=s.split()

    if s.find("-X")>=0:
        for cnt in range(0,len(words)-1):
            if words[cnt].find("-X")>=0:
                if words[cnt+1].find("-X") and cnt<len(words)-2:
                    cntAdd=2
                else:
                    cntAdd=1
                to_replace=re.search(r'(?<=[0-9]\-)\w+\b',words[cnt+cntAdd])
                if not (to_replace==None):
                    words[cnt]=words[cnt].replace("-X","-"+to_replace.group(0)+"")
                else:
                    words[cnt]=words[cnt].replace("-X","x")
    s = " ".join([word for word in words])
    
    s = re.sub('[^a-zA-Z0-9\ \%\$\-\@\&\/\.]', '', s) #remove "'" and "\n" and "#" and characters
    ##s = re.sub('(?<=[a-zA-Z])[\.|\/](?=\ |$)', '', s) #remove dots at the end of string
    s = re.sub('(?<=[0-9])x(?=\ |$)', '', s) #remove 
    s = re.sub('(?<=[\ ])x(?=[0-9])', '', s) #remove
    s = re.sub('(?<=^)x(?=[0-9])', '', s)
    #s = re.sub('[\ ]\.(?=\ |$)', '', s) #remove dots 
    s=s.replace("  "," ")
    s=s.replace("..",".")
    s = re.sub('\ \.', '', s)
    
    s=re.sub('(?<=\ [ch-hj-np-su-z][a-z])\ (?=[0-9])', '', s) #remove spaces
    s=re.sub('(?<=^[ch-hj-np-su-z][a-z])\ (?=[0-9])', '', s) #remove spaces
    
    s = re.sub('(?<=\ )\.(?=[0-9])', '0.', s)
    s = re.sub('(?<=^)\.(?=[0-9])', '0.', s)
    return " ".join([word for word in s.split()])

Example 52

Project: Amazon-Cloud-Drive-for-KODI Source File: cloudservice.py
Function: addmediafile
    def addMediaFile(self, package, contextType='video', encfs=False, dpath='', epath=''):
        thumbnail = self.cache.getThumbnail(self, package.file.thumbnail,package.file.id)
        listitem = xbmcgui.ListItem(package.file.displayTitle(), iconImage=package.file.thumbnail,
                                thumbnailImage=package.file.thumbnail)

        # audio file, not in "pictures"
        if package.file.type == package.file.AUDIO and contextType != 'image':
            if package.file.hasMeta:
                infolabels = decode_dict({ 'title' : package.file.displayTrackTitle(), 'tracknumber' : package.file.trackNumber, 'artist': package.file.artist, 'album': package.file.album,'genre': package.file.genre,'premiered': package.file.releaseDate, 'size' : package.file.size })
            else:
                infolabels = decode_dict({ 'title' : package.file.displayTitle(), 'size' : package.file.size })
            listitem.setInfo('Music', infolabels)
            playbackURL = '?mode=audio'
            if self.integratedPlayer:
                listitem.setProperty('IsPlayable', 'false')
            else:
                listitem.setProperty('IsPlayable', 'true')

        # encrypted file, viewing in "pictures", assume image
        elif package.file.type == package.file.UNKNOWN and contextType == 'image':
            infolabels = decode_dict({ 'title' : package.file.displayTitle() , 'plot' : package.file.plot })
            listitem.setInfo('Pictures', infolabels)
            playbackURL = '?mode=photo'
            listitem.setProperty('IsPlayable', 'false')

        # encrypted file, viewing in "video", assume video
        elif package.file.type == package.file.UNKNOWN and contextType == 'video':
            infolabels = decode_dict({ 'title' : package.file.displayTitle() ,  'plot' : package.file.plot, 'size' : package.file.size })
            listitem.setInfo('Video', infolabels)
            playbackURL = '?mode=video'
            if self.integratedPlayer:
                listitem.setProperty('IsPlayable', 'false')
            else:
                listitem.setProperty('IsPlayable', 'true')
            if float(package.file.resume) > 0:
                listitem.setProperty('isResumable', 1)



        # encrypted file, viewing in "music", assume audio
        elif package.file.type == package.file.UNKNOWN and contextType == 'audio':
            if package.file.hasMeta:
                infolabels = decode_dict({ 'title' : package.file.displayTrackTitle(), 'tracknumber' : package.file.trackNumber, 'artist': package.file.artist, 'album': package.file.album,'genre': package.file.genre,'premiered': package.file.releaseDate, 'size' : package.file.size })
            else:
                infolabels = decode_dict({ 'title' : package.file.displayTitle(), 'size' : package.file.size })
            listitem.setInfo('Music', infolabels)
            playbackURL = '?mode=audio'
            if self.integratedPlayer:
                listitem.setProperty('IsPlayable', 'false')
            else:
                listitem.setProperty('IsPlayable', 'true')

        # audio file, viewing in "pictures"
        elif package.file.type == package.file.AUDIO and contextType == 'image':
            if package.file.hasMeta:
                infolabels = decode_dict({ 'title' : package.file.displayTrackTitle(), 'tracknumber' : package.file.trackNumber, 'artist': package.file.artist, 'album': package.file.album,'genre': package.file.genre,'premiered': package.file.releaseDate, 'size' : package.file.size })
            else:
                infolabels = decode_dict({ 'title' : package.file.displayTitle(), 'size' : package.file.size })
            listitem.setInfo('Music', infolabels)
            playbackURL = '?mode=audio'
            listitem.setProperty('IsPlayable', 'false')

        # video file
        elif package.file.type == package.file.VIDEO:
            if package.file.hasMeta:
                infolabels = decode_dict({ 'title' : package.file.displayShowTitle() ,  'plot' : package.file.plot, 'TVShowTitle': package.file.show, 'EpisodeName': package.file.showtitle, 'season': package.file.season, 'episode': package.file.episode,'size' : package.file.size })
            else:
                infolabels = decode_dict({ 'title' : package.file.displayTitle() ,  'plot' : package.file.plot, 'size' : package.file.size })
            listitem.setInfo('Video', infolabels)
            playbackURL = '?mode=video'
            if self.integratedPlayer:
                listitem.setProperty('IsPlayable', 'false')
            else:
                listitem.setProperty('IsPlayable', 'true')
            if float(package.file.resume) > 0:
                listitem.setProperty('isResumable', "1")
            if int(package.file.playcount) > 0: #or (float(package.file.resume) > 0 and package.file.duration > 0 and package.file.resume/package.file.duration > (1-self.settskipResume)):
                listitem.setInfo('video', {'playcount':int(package.file.playcount)})

            if int(package.file.resolution[0]) > 0:
                listitem.addStreamInfo('video', {'width': package.file.resolution[1], 'height': package.file.resolution[0], 'duration':package.file.duration})

        # image file
        elif package.file.type == package.file.PICTURE:
            infolabels = decode_dict({ 'title' : package.file.displayTitle() , 'plot' : package.file.plot })
            listitem.setInfo('Pictures', infolabels)
            playbackURL = '?mode=photo'
            listitem.setProperty('IsPlayable', 'false')

        # otherwise, assume video
        else:
            infolabels = decode_dict({ 'title' : package.file.displayTitle() , 'plot' : package.file.plot, 'size' : package.file.size })
            listitem.setInfo('Video', infolabels)
            playbackURL = '?mode=video'
            if self.integratedPlayer:
                listitem.setProperty('IsPlayable', 'false')
            else:
                listitem.setProperty('IsPlayable', 'true')
            if float(package.file.resume) > 0:
                listitem.setProperty('isResumable', 1)

        listitem.setProperty('fanart_image', package.file.fanart)


        cm=[]

        try:
            url = package.getMediaURL()
            cleanURL = re.sub('---', '', url)
            cleanURL = re.sub('&', '---', cleanURL)
        except:
            cleanURL = ''

    #    url = PLUGIN_URL+playbackURL+'&title='+package.file.title+'&filename='+package.file.id+'&instance='+str(self.instanceName)+'&folder='+str(package.folder.id)
        if encfs:
            values = {'instance': self.instanceName, 'dpath': dpath, 'epath': epath, 'encfs': 'true', 'title': package.file.title, 'filename': package.file.id, 'folder': package.folder.id}
        else:
            values = {'instance': self.instanceName, 'title': package.file.title, 'filename': package.file.id, 'folder': package.folder.id}
        url = self.PLUGIN_URL+ str(playbackURL)+ '&' + urllib.urlencode(values)

        if (contextType != 'image' and package.file.type != package.file.PICTURE):
            valuesBS = {'username': self.authorization.username, 'title': package.file.title, 'filename': package.file.id, 'content_type': 'video'}
            cm.append(( self.addon.getLocalizedString(30042), 'XBMC.RunPlugin('+self.PLUGIN_URL+'?mode=buildstrm&type='+str(package.file.type)+'&'+urllib.urlencode(valuesBS)+')', ))

            if (self.protocol == 2):
                # play-original for video only
                if (contextType == 'video'):
                    if self.settings.promptQuality:
                        cm.append(( self.addon.getLocalizedString(30123), 'XBMC.RunPlugin('+url + '&original=true'+')', ))
                    else:
                        cm.append(( self.addon.getLocalizedString(30151), 'XBMC.RunPlugin('+url + '&promptquality=true'+')', ))

                    # if the options are disabled in settings, display option to playback with feature
                    if not self.settings.srt:
                        cm.append(( self.addon.getLocalizedString(30138), 'XBMC.RunPlugin('+url + '&srt=true'+')', ))
                    if not self.settings.cc:
                        cm.append(( self.addon.getLocalizedString(30146), 'XBMC.RunPlugin('+url + '&cc=true'+')', ))

                    cm.append(( self.addon.getLocalizedString(30147), 'XBMC.RunPlugin('+url + '&seek=true'+')', ))
#                    cm.append(( self.addon.getLocalizedString(30148), 'XBMC.RunPlugin('+url + '&resume=true'+')', ))
#                    values = {'instance': self.instanceName, 'folder': package.folder.id}
#                    folderurl = self.PLUGIN_URL+ str(playbackURL)+ '&' + urllib.urlencode(values)
#                    cm.append(( 'folder', 'XBMC.RunPlugin('+folderurl+')', ))

                if contextType != 'image':
                    # download
                    cm.append(( self.addon.getLocalizedString(30113), 'XBMC.RunPlugin('+url + '&download=true'+')', ))

                    # download + watch
                    cm.append(( self.addon.getLocalizedString(30124), 'XBMC.RunPlugin('+url + '&play=true&download=true'+')', ))

#                    # watch downloaded copy
#                    cm.append(( self.addon.getLocalizedString(30125), 'XBMC.RunPlugin('+url + '&cache=true'+')', ))


        elif package.file.type ==  package.file.PICTURE: #contextType == 'image':

                cm.append(( self.addon.getLocalizedString(30126), 'XBMC.RunPlugin('+self.PLUGIN_URL+ '?mode=slideshow&' + urllib.urlencode(values)+')', ))

        #encfs
#        if (self.protocol == 2):
#            cm.append(( self.addon.getLocalizedString(30130), 'XBMC.RunPlugin('+self.PLUGIN_URL+ '?mode=downloadfolder&encfs=true&' + urllib.urlencode(values)+'&content_type='+contextType+')', ))


        url = url + '&content_type='+contextType

        #    listitem.addContextMenuItems( commands )
        #    if cm:
        if  package.file.type ==  package.file.PICTURE: #contextType == 'image':
            listitem.addContextMenuItems(cm, True)
        else:
            listitem.addContextMenuItems(cm, False)

        xbmcplugin.addDirectoryItem(plugin_handle, url, listitem,
                                isFolder=False, totalItems=0)
        return url

Example 53

Project: frescobaldi Source File: vocal.py
    def build(self, data, builder):
        # normalize voicing
        staves = self.voicing.currentText().upper()
        # remove unwanted characters
        staves = re.sub(r'[^SATB-]+', '', staves)
        # remove double hyphens, and from begin and end
        staves = re.sub('-+', '-', staves).strip('-')
        if not staves:
            return
        
        splitStaves = staves.split('-')
        numStaves = len(splitStaves)
        staffCIDs = collections.defaultdict(int)    # number same-name staff Context-IDs
        voiceCounter = collections.defaultdict(int) # dict to number same voice types
        maxNumVoices = max(map(len, splitStaves))   # largest number of voices
        numStanzas = self.stanzas.value()
        lyrics = collections.defaultdict(list)      # lyrics grouped by stanza number
        pianoReduction = collections.defaultdict(list)
        rehearsalMidis = []
        
        p = ly.dom.ChoirStaff()
        choir = ly.dom.Sim(p)
        data.nodes.append(p)
        
        # print main instrumentName if there are more choirs, and we
        # have more than one staff.
        if numStaves > 1 and data.num:
            builder.setInstrumentNames(p,
                builder.instrumentName(lambda _: _("Choir"), data.num),
                builder.instrumentName(lambda _: _("abbreviation for Choir", "Ch."), data.num))
        
        # get the preferred way of adding lyrics
        lyrAllSame, lyrEachSame, lyrEachDiff, lyrSpread = (
            self.lyrics.currentIndex() == i for i in range(4))
        lyrEach = lyrEachSame or lyrEachDiff
        
        # stanzas to print (0 = don't print stanza number):
        if numStanzas == 1:
            allStanzas = [0]
        else:
            allStanzas = list(range(1, numStanzas + 1))
        
        # Which stanzas to print where:
        if lyrSpread and numStanzas > 1 and numStaves > 2:
            spaces = numStaves - 1
            count, rest = divmod(max(numStanzas, spaces), spaces)
            stanzaSource = itertools.cycle(allStanzas)
            stanzaGroups = (itertools.islice(stanzaSource, num)
                            for num in itertools.chain(
                                itertools.repeat(count + 1, rest),
                                itertools.repeat(count, numStaves - rest)))
        else:
            stanzaGroups = itertools.repeat(allStanzas, numStaves)
        
        # a function to set staff affinity (in LilyPond 2.13.4 and above):
        if builder.lyVersion >= (2, 13, 4):
            def setStaffAffinity(context, affinity):
                ly.dom.Line("\\override VerticalAxisGroup "
                     "#'staff-affinity = #" + affinity, context.getWith())
        else:
            def setStaffAffinity(lyricsContext, affinity):
                pass
        
        # a function to make a column markup:
        if builder.lyVersion >= (2, 11, 57):
            columnCommand = 'center-column'
        else:
            columnCommand = 'center-align'
        def makeColumnMarkup(names):
            node = ly.dom.Markup()
            column = ly.dom.MarkupEnclosed(columnCommand, node)
            for name in names:
                ly.dom.QuotedString(name, column)
            return node
        
        stavesLeft = numStaves
        for staff, stanzas in zip(splitStaves, stanzaGroups):
            # are we in the last staff?
            stavesLeft -= 1
            # the number of voices in this staff
            numVoices = len(staff)
            # sort the letters in order SATB
            staff = ''.join(i * staff.count(i) for i in 'SATB')
            # Create the staff for the voices
            s = ly.dom.Staff(parent=choir)
            builder.setMidiInstrument(s, self.midiInstrument)
            
            # Build a list of the voices in this staff.
            # Each entry is a tuple(name, num).
            # name is one of 'S', 'A', 'T', or 'B'
            # num is an integer: 0 when a voice occurs only once, or >= 1 when
            # there are more voices of the same type (e.g. Soprano I and II)
            voices = []
            for voice in staff:
                if staves.count(voice) > 1:
                    voiceCounter[voice] += 1
                voices.append((voice, voiceCounter[voice]))
            
            # Add the instrument names to the staff:
            if numVoices == 1:
                voice, num = voices[0]
                longName = builder.instrumentName(voice2Voice[voice].title, num)
                shortName = builder.instrumentName(voice2Voice[voice].short, num)
                builder.setInstrumentNames(s, longName, shortName)
            else:
                # stack instrument names (long and short) in a markup column.
                # long names
                longNames = makeColumnMarkup(
                    builder.instrumentName(voice2Voice[voice].title, num) for voice, num in voices)
                shortNames = makeColumnMarkup(
                    builder.instrumentName(voice2Voice[voice].short, num) for voice, num in voices)
                builder.setInstrumentNames(s, longNames, shortNames)
            
            # Make the { } or << >> holder for this staff's children.
            # If *all* staves have only one voice, addlyrics is used.
            # In that case, don't remove the braces.
            staffMusic = (ly.dom.Seq if lyrEach and maxNumVoices == 1 else
                          ly.dom.Seqr if numVoices == 1 else ly.dom.Simr)(s)
            
            # Set the clef for this staff:
            if 'B' in staff:
                ly.dom.Clef('bass', staffMusic)
            elif 'T' in staff:
                ly.dom.Clef('treble_8', staffMusic)

            # Determine voice order (\voiceOne, \voiceTwo etc.)
            if numVoices == 1:
                order = (0,)
            elif numVoices == 2:
                order = 1, 2
            elif staff in ('SSA', 'TTB'):
                order = 1, 3, 2
            elif staff in ('SAA', 'TBB'):
                order = 1, 2, 4
            elif staff in ('SSAA', 'TTBB'):
                order = 1, 3, 2, 4
            else:
                order = range(1, numVoices + 1)
            
            # What name would the staff get if we need to refer to it?
            # If a name (like 's' or 'sa') is already in use in this part,
            # just add a number ('ss2' or 'sa2', etc.)
            staffCIDs[staff] += 1
            cid = ly.dom.Reference(staff.lower() +
                str(staffCIDs[staff] if staffCIDs[staff] > 1 else ""))
            
            # Create voices and their lyrics:
            for (voice, num), voiceNum in zip(voices, order):
                name = voice2id[voice]
                if num:
                    name += ly.util.int2text(num)
                a = data.assignMusic(name, voice2Voice[voice].octave)
                lyrName = name + 'Verse' if lyrEachDiff else 'verse'
            
                # Use \addlyrics if all staves have exactly one voice.
                if lyrEach and maxNumVoices == 1:
                    for verse in stanzas:
                        lyrics[verse].append((ly.dom.AddLyrics(s), lyrName))
                    ly.dom.Identifier(a.name, staffMusic)
                else:
                    voiceName = voice2id[voice] + str(num or '')
                    v = ly.dom.Voice(voiceName, parent=staffMusic)
                    voiceMusic = ly.dom.Seqr(v)
                    if voiceNum:
                        ly.dom.Text('\\voice' + ly.util.int2text(voiceNum), voiceMusic)
                    ly.dom.Identifier(a.name, voiceMusic)
                    
                    if stanzas and (lyrEach or (voiceNum <= 1 and
                                    (stavesLeft or numStaves == 1))):
                        # Create the lyrics. If they should be above the staff,
                        # give the staff a suitable name, and use alignAbove-
                        # Context to align the Lyrics above the staff.
                        above = voiceNum & 1 if lyrEach else False
                        if above and s.cid is None:
                            s.cid = cid

                        for verse in stanzas:
                            l = ly.dom.Lyrics(parent=choir)
                            if above:
                                l.getWith()['alignAboveContext'] = cid
                                setStaffAffinity(l, "DOWN")
                            elif not lyrEach and stavesLeft:
                                setStaffAffinity(l, "CENTER")
                            lyrics[verse].append((ly.dom.LyricsTo(voiceName, l), lyrName))

                # Add ambitus:
                if self.ambitus.isChecked():
                    ambitusContext = (s if numVoices == 1 else v).getWith()
                    ly.dom.Line('\\consists "Ambitus_engraver"', ambitusContext)
                    if voiceNum > 1:
                        ly.dom.Line("\\override Ambitus #'X-offset = #{0}".format(
                                 (voiceNum - 1) * 2.0), ambitusContext)
            
                pianoReduction[voice].append(a.name)
                rehearsalMidis.append((voice, num, a.name, lyrName))
            
        # Assign the lyrics, so their definitions come after the note defs.
        # (These refs are used again below in the midi rehearsal routine.)
        refs = {}
        for verse in allStanzas:
            for node, name in lyrics[verse]:
                if (name, verse) not in refs:
                    refs[(name, verse)] = self.assignLyrics(data, name, verse).name
                ly.dom.Identifier(refs[(name, verse)], node)

        # Create the piano reduction if desired
        if self.pianoReduction.isChecked():
            a = data.assign('pianoReduction')
            data.nodes.append(ly.dom.Identifier(a.name))
            piano = ly.dom.PianoStaff(parent=a)
            
            sim = ly.dom.Sim(piano)
            rightStaff = ly.dom.Staff(parent=sim)
            leftStaff = ly.dom.Staff(parent=sim)
            right = ly.dom.Seq(rightStaff)
            left = ly.dom.Seq(leftStaff)
            
            # Determine the ordering of voices in the staves
            upper = pianoReduction['S'] + pianoReduction['A']
            lower = pianoReduction['T'] + pianoReduction['B']
            
            preferUpper = 1
            if not upper:
                # Male choir
                upper = pianoReduction['T']
                lower = pianoReduction['B']
                ly.dom.Clef("treble_8", right)
                ly.dom.Clef("bass", left)
                preferUpper = 0
            elif not lower:
                # Female choir
                upper = pianoReduction['S']
                lower = pianoReduction['A']
            else:
                ly.dom.Clef("bass", left)

            # Otherwise accidentals can be confusing
            ly.dom.Line("#(set-accidental-style 'piano)", right)
            ly.dom.Line("#(set-accidental-style 'piano)", left)
            
            # Move voices if unevenly spread
            if abs(len(upper) - len(lower)) > 1:
                voices = upper + lower
                half = (len(voices) + preferUpper) // 2
                upper = voices[:half]
                lower = voices[half:]
            
            for staff, voices in (ly.dom.Simr(right), upper), (ly.dom.Simr(left), lower):
                if voices:
                    for v in voices[:-1]:
                        ly.dom.Identifier(v, staff)
                        ly.dom.VoiceSeparator(staff).after = 1
                    ly.dom.Identifier(voices[-1], staff)

            # Make the piano part somewhat smaller
            ly.dom.Line("fontSize = #-1", piano.getWith())
            ly.dom.Line("\\override StaffSymbol #'staff-space = #(magstep -1)",
                piano.getWith())
            
            # Nice to add Mark engravers
            ly.dom.Line('\\consists "Mark_engraver"', rightStaff.getWith())
            ly.dom.Line('\\consists "Metronome_mark_engraver"', rightStaff.getWith())
            
            # Keep piano reduction out of the MIDI output
            if builder.midi:
                ly.dom.Line('\\remove "Staff_performer"', rightStaff.getWith())
                ly.dom.Line('\\remove "Staff_performer"', leftStaff.getWith())
        
        # Create MIDI files if desired
        if self.rehearsalMidi.isChecked():
            a = data.assign('rehearsalMidi')
            rehearsalMidi = a.name
            
            func = ly.dom.SchemeList(a)
            func.pre = '#\n(' # hack
            ly.dom.Text('define-music-function', func)
            ly.dom.Line('(parser location name midiInstrument lyrics) '
                 '(string? string? ly:music?)', func)
            choir = ly.dom.Sim(ly.dom.Command('unfoldRepeats', ly.dom.SchemeLily(func)))
            
            data.afterblocks.append(ly.dom.Comment(_("Rehearsal MIDI files:")))
            
            for voice, num, ref, lyrName in rehearsalMidis:
                # Append voice to the rehearsalMidi function
                name = voice2id[voice] + str(num or '')
                seq = ly.dom.Seq(ly.dom.Voice(name, parent=ly.dom.Staff(name, parent=choir)))
                if builder.lyVersion < (2, 18, 0):
                    ly.dom.Text('<>\\f', seq) # add one dynamic
                ly.dom.Identifier(ref, seq) # add the reference to the voice
                
                book = ly.dom.Book()
                
                # Append score to the aftermath (stuff put below the main score)
                suffix = "choir{0}-{1}".format(data.num, name) if data.num else name
                if builder.lyVersion < (2, 12, 0):
                    data.afterblocks.append(
                        ly.dom.Line('#(define output-suffix "{0}")'.format(suffix)))
                else:
                    ly.dom.Line('\\bookOutputSuffix "{0}"'.format(suffix), book)
                data.afterblocks.append(book)
                data.afterblocks.append(ly.dom.BlankLine())
                score = ly.dom.Score(book)
                
                # TODO: make configurable
                midiInstrument = voice2Midi[voice]

                cmd = ly.dom.Command(rehearsalMidi, score)
                ly.dom.QuotedString(name, cmd)
                ly.dom.QuotedString(midiInstrument, cmd)
                ly.dom.Identifier(refs[(lyrName, allStanzas[0])], cmd)
                ly.dom.Midi(score)
            
            ly.dom.Text("\\context Staff = $name", choir)
            seq = ly.dom.Seq(choir)
            ly.dom.Line("\\set Score.midiMinimumVolume = #0.5", seq)
            ly.dom.Line("\\set Score.midiMaximumVolume = #0.5", seq)
            ly.dom.Line("\\set Score.tempoWholesPerMinute = #" + data.scoreProperties.schemeMidiTempo(), seq)
            ly.dom.Line("\\set Staff.midiMinimumVolume = #0.8", seq)
            ly.dom.Line("\\set Staff.midiMaximumVolume = #1.0", seq)
            ly.dom.Line("\\set Staff.midiInstrument = $midiInstrument", seq)
            lyr = ly.dom.Lyrics(parent=choir)
            lyr.getWith()['alignBelowContext'] = ly.dom.Text('$name')
            ly.dom.Text("\\lyricsto $name $lyrics", lyr)

Example 54

Project: cgat Source File: IndexedFasta.py
def createDatabase(db, iterator,
                   force=False,
                   synonyms=None,
                   compression=None,
                   random_access_points=None,
                   regex_identifier=None,
                   clean_sequence=False,
                   ignore_duplicates=False,
                   allow_duplicates=False,
                   translator=None):
    """index files in filenames to create database.

    Two new files are created - db.fasta and db_name.idx

    If compression is enabled, provide random access points
    every # bytes.

    Dictzip is treated as an uncompressed file.

    regex_identifier: pattern to extract identifier from description line.
    If None, the part until the first white-space character is used.

    translator: specify a translator
    """

    if db.endswith(".fasta"):
        db = db[:-len(".fasta")]

    if compression:
        if compression == "lzo":
            import lzo

            def lzo_mangler(s):
                return lzo.compress(s, 9)
            mangler = lzo_mangler
            db_name = db + ".lzo"
            write_chunks = True
        elif compression == "zlib":
            def zlib_mangler(s):
                return zlib.compress(s, 9)
            mangler = zlib_mangler
            db_name = db + ".zlib"
            write_chunks = True
        elif compression == "gzip":
            mangler = gzip_mangler
            db_name = db + ".gz"
            write_chunks = True
        elif compression == "dictzip":
            from . import dictzip

            def mangler(x):
                return x

            db_name = db + ".dz"
            write_chunks = False
        elif compression == "bzip2":
            import bz2

            def bzip_mangler(x):
                return bz2.compress(x, 9)

            mangler = bzip_mangler
            db_name = db + ".bz2"
            write_chunks = True
        elif compression == "debug":
            def mangler(x):
                return x
            db_name = db + ".debug"
            write_chunks = True
        elif compression == "rle":
            from . import RLE
            mangler = RLE.compress
            db_name = db + ".rle"
            write_chunks = True
        else:
            raise ValueError("unknown compression library: %s" % compression)

        index_name = db + ".cdx"

        if write_chunks and random_access_points is None \
           or random_access_points <= 0:
            raise ValueError("specify chunksize in --random-access-points")

    else:
        def mangler(x):
            return x
        db_name = db + ".fasta"
        write_chunks = False
        index_name = db + ".idx"

    if os.path.exists(db_name) and not force:
        raise ValueError("database %s already exists." % db_name)

    if os.path.exists(index_name) and not force:
        raise ValueError("database index %s already exists." % index_name)

    outfile_index = open(index_name, "w")
    if compression == "dictzip":
        if random_access_points is None or random_access_points <= 0:
            raise ValueError(
                "specify dictzip chunksize in --random-access-points")
        outfile_fasta = dictzip.open(
            db_name, "wb", buffersize=1000000, chunksize=random_access_points)
        compression = None
    else:
        outfile_fasta = open(db_name, "w")

    identifiers = {}
    lsequence = 0
    identifier_pos, sequence_pos = 0, 0

    if sys.version_info.major >= 3:
        translation = str.maketrans("xX", "nN")
    else:
        translation = string.maketrans("xX", "nN")

    fragments = []
    lfragment = 0

    last_identifier = None

    while 1:

        try:
            result = next(iterator)
        except StopIteration:
            break

        if not result:
            break

        is_new, identifier, fragment = result

        if is_new:
            # check for duplicate identifiers
            if identifier in identifiers:
                if ignore_duplicates:
                    raise ValueError("ignore duplicates not implemented")
                elif allow_duplicates:
                    # the current implementation will fail if the same
                    # identifiers
                    # are directly succeeding each other
                    # better: add return to iterator that indicates a new
                    # identifier
                    out_identifier = identifier + \
                        "_%i" % (identifiers[identifier])
                    identifiers[identifier] += 1
                    identifiers[out_identifier] = 1
                else:
                    raise ValueError("%s occurs more than once" %
                                     (identifier,))
            else:
                identifiers[identifier] = 1
                out_identifier = identifier

            if last_identifier:
                if write_chunks:
                    writeFragments(outfile_fasta, outfile_index,
                                   fragments, mangler,
                                   size=random_access_points,
                                   write_all=True)

                    fragments = []
                    lfragment = 0
                else:
                    outfile_fasta.write("\n")

                outfile_index.write("\t%i\n" % lsequence)

            identifier_pos = outfile_fasta.tell()
            outfile_fasta.write(mangler(">%s\n" % out_identifier))
            sequence_pos = outfile_fasta.tell()

            outfile_index.write("%s\t%i" % (out_identifier,
                                            identifier_pos))
            if write_chunks:
                outfile_index.write("\t%i" % random_access_points)
            else:
                outfile_index.write("\t%i" % sequence_pos)

            fragments = []
            lsequence = 0
            last_identifier = identifier

        if translator:
            s = translator(fragment)
        else:
            s = re.sub("\s", "", fragment.strip())
            if clean_sequence:
                s = s.translate(translation)

        lsequence += len(s)

        if write_chunks:
            fragments.append(s)
            lfragment += len(s)
            if lfragment > random_access_points:
                rest = writeFragments(outfile_fasta,
                                      outfile_index,
                                      fragments,
                                      mangler,
                                      size=random_access_points,
                                      write_all=False)
                fragments = [rest]
                lfragment = len(rest)
        else:
            outfile_fasta.write(mangler(s))

    if write_chunks:
        writeFragments(outfile_fasta, outfile_index, fragments, mangler,
                       size=random_access_points, write_all=True)
    else:
        outfile_fasta.write("\n")

    outfile_index.write("\t%i\n" % lsequence)

    # add synonyms for the table
    if synonyms:
        for key, vals in list(synonyms.items()):
            for val in vals:
                outfile_index.write("%s\t%s\n" % (key, val))

Example 55

Project: pyxform Source File: xls2json_backends.py
def xls_to_dict(path_or_file):
    """
    Return a Python dictionary with a key for each worksheet
    name. For each sheet there is a list of dictionaries, each
    dictionary corresponds to a single row in the worksheet. A
    dictionary has keys taken from the column headers and values
    equal to the cell value for that row and column.
    All the keys and leaf elements are unicode text.
    """
    try:
        if isinstance(path_or_file, basestring):
            workbook = xlrd.open_workbook(filename=path_or_file)
        else:
            workbook = xlrd.open_workbook(file_contents=path_or_file.read())
    except XLRDError as e:
        raise PyXFormError("Error reading .xls file: %s" % e.message)

    def xls_value_to_unicode(value, value_type):
        """
        Take a xls formatted value and try to make a unicode string
        representation.
        """
        if value_type == xlrd.XL_CELL_BOOLEAN:
            return u"TRUE" if value else u"FALSE"
        elif value_type == xlrd.XL_CELL_NUMBER:
            # Try to display as an int if possible.
            int_value = int(value)
            if int_value == value:
                return unicode(int_value)
            else:
                return unicode(value)
        elif value_type is xlrd.XL_CELL_DATE:
            # Warn that it is better to single quote as a string.
            # error_location = cellFormatString % (ss_row_idx, ss_col_idx)
            # raise Exception(
            #   "Cannot handle excel formatted date at " + error_location)
            datetime_or_time_only = xlrd.xldate_as_tuple(
                value, workbook.datemode)
            if datetime_or_time_only[:3] == (0, 0, 0):
                # must be time only
                return unicode(datetime.time(*datetime_or_time_only[3:]))
            return unicode(datetime.datetime(*datetime_or_time_only))
        else:
            # ensure unicode and replace nbsp spaces with normal ones
            # to avoid this issue:
            # https://github.com/modilabs/pyxform/issues/83
            return unicode(value).replace(unichr(160), ' ')

    def xls_to_dict_normal_sheet(sheet):
        def iswhitespace(string):
            return (
                isinstance(string, basestring) and len(string.strip()) == 0)

        # Check for duplicate column headers
        column_header_list = list()
        for column in range(0, sheet.ncols):
            column_header = sheet.cell_value(0, column)
            if column_header in column_header_list:
                raise PyXFormError(
                    u"Duplicate column header: %s" % column_header)
            # xls file with 3 columns mostly have a 3 more columns that are
            # blank by default or something, skip during check
            if column_header is not None:
                if not iswhitespace(column_header):
                    column_header_list.append(column_header)

        result = []
        for row in range(1, sheet.nrows):
            row_dict = OrderedDict()
            for column in range(0, sheet.ncols):
                # Changing to cell_value function
                # convert to string, in case it is not string
                key = u"%s" % sheet.cell_value(0, column)
                key = key.strip()
                value = sheet.cell_value(row, column)
                # remove whitespace at the beginning and end of value
                if isinstance(value, basestring):
                    value = value.strip()
                value_type = sheet.cell_type(row, column)
                if value is not None:
                    if not iswhitespace(value):
                        row_dict[key] = xls_value_to_unicode(value, value_type)
                # Taking this condition out so I can get accurate row numbers.
                # TODO: Do the same for csvs
                # if row_dict != {}:
            result.append(row_dict)
        return result, _list_to_dict_list(column_header_list)

    def xls_value_from_sheet(sheet, row, column):
        value = sheet.cell_value(row, column)
        value_type = sheet.cell_type(row, column)
        if value is not None and value != "":
            return xls_value_to_unicode(value, value_type)
        else:
            raise PyXFormError("Empty Value")

    def _xls_to_dict_cascade_sheet(sheet):
        result = []
        rs_dict = OrderedDict()  # tmp dict to hold entire structure

        def slugify(s):
            return re.sub(r'\W+', '_', s.strip().lower())

        prefix = "$PREFIX$"
        # get col headers and position first, ignore first column
        for column in range(1, sheet.ncols):
            col_name = sheet.cell_value(0, column)
            rs_dict[col_name] = {
                'pos': column,
                'data': [],
                'itemset': col_name,
                'type': constants.SELECT_ONE,
                'name':
                    prefix if (column == sheet.ncols - 1) else u''.join(
                        [prefix, '_', col_name]),
                'label': sheet.cell_value(1, column)}
            if column > 1:
                rs_dict[col_name]['parent'] = sheet.cell_value(0, column - 1)
            else:
                rs_dict[col_name]['choices'] = []
            choice_filter = ''
            for a in range(1, column):
                prev_col_name = sheet.cell_value(0, a)
                if choice_filter != '':
                    choice_filter += ' and %s=${%s_%s}' % \
                                     (prev_col_name, prefix, prev_col_name)
                else:
                    choice_filter += '%s=${%s_%s}' % \
                                     (prev_col_name, prefix, prev_col_name)
            rs_dict[col_name]['choice_filter'] = choice_filter
        # get data, use new cascade dict structure, data starts on 3 row
        for row in range(2, sheet.nrows):
            # go through each header aka column
            for col_name in rs_dict:
                column = rs_dict[col_name]['pos']
                cell_data = xls_value_from_sheet(sheet, row, column)
                try:
                    rs_dict[col_name]['data'].index(slugify(cell_data))
                except ValueError:
                    rs_dict[col_name]['data'].append(slugify(cell_data))
                    if 'choices' in rs_dict[col_name]:
                        l = {'name': slugify(cell_data), 'label': cell_data}
                        rs_dict[col_name]['choices'].append(l)
                data = {
                    'name': slugify(cell_data),
                    'label': cell_data.strip(),
                    constants.LIST_NAME: col_name
                }
                for prev_column in range(1, column):
                    prev_col_name = sheet.cell_value(0, prev_column)
                    data[prev_col_name] = slugify(xls_value_from_sheet(
                        sheet, row, prev_column))
                result.append(data)
        # order
        kl = []
        for column in range(1, sheet.ncols):
            col_name = sheet.cell_value(0, column)
            if 'parent' in rs_dict[col_name]:
                rs_dict[col_name].pop('parent')
            if 'pos' in rs_dict[col_name]:
                rs_dict[col_name].pop('pos')
            if 'data' in rs_dict[col_name]:
                rs_dict[col_name].pop('data')
            kl.append(rs_dict[col_name])

            # create list with no duplicates
        choices = []
        for rec in result:
            c = 0
            for check in result:
                if rec == check:
                    c += 1
            if c == 1:
                choices.append(rec)
            else:
                try:
                    choices.index(rec)
                except ValueError:
                    choices.append(rec)
        return [{'choices': choices, 'questions': kl}]

    result = OrderedDict()
    for sheet in workbook.sheets():
        if sheet.name == constants.CASCADING_CHOICES:
            result[sheet.name] = _xls_to_dict_cascade_sheet(sheet)
        else:
            result[sheet.name], result[u"%s_header" % sheet.name] = \
                xls_to_dict_normal_sheet(sheet)
    return result
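The only re.sub user in this example is the nested slugify helper, which lowercases a cell value and replaces each run of non-word characters with an underscore so cascading-choice values become safe names. A standalone sketch with an invented cell value:

import re

def slugify(s):
    return re.sub(r'\W+', '_', s.strip().lower())

print(slugify('  North-West Region (urban) '))  # 'north_west_region_urban_'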

Example 56

Project: termite-data-server Source File: jsmin.py
Function: make_jsmin
def _make_jsmin(extended=True, python_only=True):
    """
    Generate JS minifier based on `jsmin.c by Douglas Crockford`_

    .. _jsmin.c by Douglas Crockford:
       http://www.crockford.com/javascript/jsmin.c

    :Parameters:
      `extended` : ``bool``
        Extended Regexps? (using lookahead and lookbehind). This is faster,
        because it can be optimized way more. The regexps used with `extended`
        being false are only left here to allow easier porting to platforms
        without extended regex features (and for my own reference...)

      `python_only` : ``bool``
        Use only the python variant. If true, the c extension is not even
        tried to be loaded.

    :Return: Minifier
    :Rtype: ``callable``
    """
    # pylint: disable = R0912, R0914, W0612
    if not python_only:
        try:
            import _rjsmin
        except ImportError:
            pass
        else:
            return _rjsmin.jsmin
    try:
        xrange
    except NameError:
        xrange = range  # pylint: disable = W0622

    space_chars = r'[\000-\011\013\014\016-\040]'

    line_comment = r'(?://[^\r\n]*)'
    space_comment = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
    string1 = \
        r'(?:\047[^\047\\\r\n]*(?:\\(?:[^\r\n]|\r?\n|\r)[^\047\\\r\n]*)*\047)'
    string2 = r'(?:"[^"\\\r\n]*(?:\\(?:[^\r\n]|\r?\n|\r)[^"\\\r\n]*)*")'
    strings = r'(?:%s|%s)' % (string1, string2)

    charclass = r'(?:\[[^\\\]\r\n]*(?:\\[^\r\n][^\\\]\r\n]*)*\])'
    nospecial = r'[^/\\\[\r\n]'
    if extended:
        regex = r'(?:/(?![\r\n/*])%s*(?:(?:\\[^\r\n]|%s)%s*)*/)' % (
            nospecial, charclass, nospecial
        )
    else:
        regex = (
            r'(?:/(?:[^*/\\\r\n\[]|%s|\\[^\r\n])%s*(?:(?:\\[^\r\n]|%s)%s*)*/)'
        )
        regex = regex % (charclass, nospecial, charclass, nospecial)

    space = r'(?:%s|%s)' % (space_chars, space_comment)
    newline = r'(?:%s?[\r\n])' % line_comment

    def fix_charclass(result):
        """ Fixup string of chars to fit into a regex char class """
        pos = result.find('-')
        if pos >= 0:
            result = r'%s%s-' % (result[:pos], result[pos + 1:])

        def sequentize(string):
            """
            Notate consecutive characters as sequence

            (1-4 instead of 1234)
            """
            first, last, result = None, None, []
            for char in map(ord, string):
                if last is None:
                    first = last = char
                elif last + 1 == char:
                    last = char
                else:
                    result.append((first, last))
                    first = last = char
            if last is not None:
                result.append((first, last))
            return ''.join(['%s%s%s' % (
                chr(first),
                last > first + 1 and '-' or '',
                last != first and chr(last) or ''
            ) for first, last in result])

        return _re.sub(r'([\000-\040\047])',  # for better portability
                       lambda m: '\\%03o' % ord(m.group(1)), (sequentize(result)
                                                              .replace('\\', '\\\\')
                                                              .replace('[', '\\[')
                                                              .replace(']', '\\]')
                                                              )
                       )

    def id_literal_(what):
        """ Make id_literal like char class """
        match = _re.compile(what).match
        result = ''.join([
            chr(c) for c in xrange(127) if not match(chr(c))
        ])
        return '[^%s]' % fix_charclass(result)

    def not_id_literal_(keep):
        """ Make negated id_literal like char class """
        match = _re.compile(id_literal_(keep)).match
        result = ''.join([
            chr(c) for c in xrange(127) if not match(chr(c))
        ])
        return r'[%s]' % fix_charclass(result)

    not_id_literal = not_id_literal_(r'[a-zA-Z0-9_$]')
    preregex1 = r'[(,=:\[!&|?{};\r\n]'
    preregex2 = r'%(not_id_literal)sreturn' % locals()

    if extended:
        id_literal = id_literal_(r'[a-zA-Z0-9_$]')
        id_literal_open = id_literal_(r'[a-zA-Z0-9_${\[(+-]')
        id_literal_close = id_literal_(r'[a-zA-Z0-9_$}\])"\047+-]')

        space_sub = _re.compile((
            r'([^\047"/\000-\040]+)'
            r'|(%(strings)s[^\047"/\000-\040]*)'
            r'|(?:(?<=%(preregex1)s)%(space)s*(%(regex)s[^\047"/\000-\040]*))'
            r'|(?:(?<=%(preregex2)s)%(space)s*(%(regex)s[^\047"/\000-\040]*))'
            r'|(?<=%(id_literal_close)s)'
            r'%(space)s*(?:(%(newline)s)%(space)s*)+'
            r'(?=%(id_literal_open)s)'
            r'|(?<=%(id_literal)s)(%(space)s)+(?=%(id_literal)s)'
            r'|%(space)s+'
            r'|(?:%(newline)s%(space)s*)+'
        ) % locals()).sub

        def space_subber(match):
            """ Substitution callback """
            # pylint: disable = C0321, R0911
            groups = match.groups()
            if groups[0]:
                return groups[0]
            elif groups[1]:
                return groups[1]
            elif groups[2]:
                return groups[2]
            elif groups[3]:
                return groups[3]
            elif groups[4]:
                return '\n'
            elif groups[5]:
                return ' '
            else:
                return ''

        def jsmin(script):  # pylint: disable = W0621
            r"""
            Minify javascript based on `jsmin.c by Douglas Crockford`_\.

            Instead of parsing the stream char by char, it uses a regular
            expression approach which minifies the whole script with one big
            substitution regex.

            .. _jsmin.c by Douglas Crockford:
               http://www.crockford.com/javascript/jsmin.c

            :Parameters:
              `script` : ``str``
                Script to minify

            :Return: Minified script
            :Rtype: ``str``
            """
            return space_sub(space_subber, '\n%s\n' % script).strip()

    else:
        pre_regex = r'(?:%(preregex1)s|%(preregex2)s)' % locals()
        not_id_literal_open = not_id_literal_(r'[a-zA-Z0-9_${\[(+-]')
        not_id_literal_close = not_id_literal_(r'[a-zA-Z0-9_$}\])"\047+-]')

        space_norm_sub = _re.compile((
            r'(%(strings)s)'
            r'|(?:(%(pre_regex)s)%(space)s*(%(regex)s))'
            r'|(%(space)s)+'
            r'|(?:(%(newline)s)%(space)s*)+'
        ) % locals()).sub

        def space_norm_subber(match):
            """ Substitution callback """
            # pylint: disable = C0321
            groups = match.groups()
            if groups[0]:
                return groups[0]
            elif groups[1]:
                return groups[1].replace('\r', '\n') + groups[2]
            elif groups[3]:
                return ' '
            elif groups[4]:
                return '\n'

        space_sub1 = _re.compile((
            r'[\040\n]?(%(strings)s|%(pre_regex)s%(regex)s)'
            r'|\040(%(not_id_literal)s)'
            r'|\n(%(not_id_literal_open)s)'
        ) % locals()).sub

        def space_subber1(match):
            """ Substitution callback """
            groups = match.groups()
            return groups[0] or groups[1] or groups[2]

        space_sub2 = _re.compile((
            r'(%(strings)s)\040?'
            r'|(%(pre_regex)s%(regex)s)[\040\n]?'
            r'|(%(not_id_literal)s)\040'
            r'|(%(not_id_literal_close)s)\n'
        ) % locals()).sub

        def space_subber2(match):
            """ Substitution callback """
            groups = match.groups()
            return groups[0] or groups[1] or groups[2] or groups[3]

        def jsmin(script):
            r"""
            Minify javascript based on `jsmin.c by Douglas Crockford`_\.

            Instead of parsing the stream char by char, it uses a regular
            expression approach. The script is minified with three passes:

            normalization
                Control characters are mapped to spaces, spaces and newlines
                are squeezed, and comments are stripped.
            space removal 1
                Spaces before certain tokens are removed
            space removal 2
                Spaces after certain tokens are removed

            .. _jsmin.c by Douglas Crockford:
               http://www.crockford.com/javascript/jsmin.c

            :Parameters:
              `script` : ``str``
                Script to minify

            :Return: Minified script
            :Rtype: ``str``
            """
            return space_sub2(space_subber2,
                              space_sub1(space_subber1,
                                         space_norm_sub(space_norm_subber,
                                                        '\n%s\n' % script)
                                         )
                              ).strip()
    return jsmin
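
The single-regex, callback-driven approach above can be illustrated with a much smaller sketch (a hypothetical pattern, not the real jsmin regex): one compiled substitution regex whose callback decides, per match, whether to keep a string literal verbatim, collapse a newline run, or squeeze spaces.

import re

# Hypothetical, simplified whitespace squeezer in the spirit of space_subber above.
_ws_sub = re.compile(
    r'("(?:[^"\\]|\\.)*")'    # group 1: double-quoted string, kept verbatim
    r'|[ \t]*\n[ \t\n]*'      # newline runs -> single newline
    r'|[ \t]+'                # space/tab runs -> single space
).sub

def _subber(match):
    if match.group(1):
        return match.group(1)   # never touch string literals
    if '\n' in match.group(0):
        return '\n'
    return ' '

def squeeze(script):
    return _ws_sub(_subber, '\n%s\n' % script).strip()

print(squeeze('var a = "x   y";   \n\n\n  var b = 1;'))
# var a = "x   y";
# var b = 1;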

Example 57

Project: cmdbac Source File: basedeployer.py
    def save_attempt(self, attempt_result, driver_result = {}):
        LOG.info("Saving attempt ...")

        # flush log
        self.flush_log()

        # get info
        register_result = driver_result.get('register', USER_STATUS_UNKNOWN)
        login_result = driver_result.get('login', USER_STATUS_UNKNOWN)
        forms = driver_result.get('forms', None)
        urls = driver_result.get('urls', None)
        screenshot_path = driver_result.get('screenshot', None)
        statistics = driver_result.get('statistics', None)
        informations = driver_result.get('informations', None)

        # get runtime
        if self.runtime == None:
            self.runtime = self.get_runtime()
        Runtime.objects.get_or_create(executable = self.runtime['executable'], version = self.runtime['version'])
        runtime = Runtime.objects.get(executable = self.runtime['executable'], version = self.runtime['version'])

        # save attempt
        self.attempt.result = attempt_result
        self.attempt.login = login_result
        self.attempt.register = register_result
        self.attempt.stop_time = datetime.now()
        self.attempt.size = utils.get_size(self.base_path)
        self.attempt.runtime = runtime
        self.attempt.actions_count = 0
        self.attempt.queries_count = 0
        if forms == None and urls == None and self.attempt.result == ATTEMPT_STATUS_SUCCESS:
            self.attempt.result = ATTEMPT_STATUS_NO_QUERIES

        self.attempt.save()

        # save forms
        if forms != None:
            url_patterns = set()
            for f in forms:
                try:
                    if '/admin' in f['url']:
                        continue
                    url_pattern = re.sub('\d', '', f['url'])
                    if url_pattern in url_patterns:
                        continue
                    url_patterns.add(url_pattern)
                    action = Action()
                    action.url = f['url']
                    if f['method'] == '':
                        f['method'] = 'get'
                    action.method = f['method'].upper()
                    action.attempt = self.attempt
                    action.save()
                    self.attempt.actions_count += 1
                    for q in f['queries']:
                        try:
                            query = Query()
                            query.content = q['content']
                            query.matched = q['matched']
                            query.action = action
                            query.save()
                            self.attempt.queries_count += 1

                            if 'explain' in q:
                                explain = Explain()
                                explain.output = q['explain']
                                explain.query = query
                                explain.save()

                            if 'stats' in q:
                                metric = QueryMetric()
                                metric.name = 'stats'
                                metric.value = str(q['stats'])
                                metric.query = query
                                metric.save()
                        except:
                            pass
                    for input in f['inputs']:
                        field = Field()
                        field.name = input['name']
                        field.type = input['type']
                        field.action = action
                        field.save()
                    for description, count in f['counter'].iteritems():
                        counter = Counter()
                        counter.description = description
                        counter.count = count
                        counter.action = action
                        counter.save()
                except Exception, e:
                    LOG.exception(e)  

        # save urls
        if urls != None:
            url_patterns = set()
            for u in urls:
                try:
                    if '/admin' in u['url']:
                        continue
                    url_pattern = re.sub('\d', '', u['url'])
                    if url_pattern in url_patterns:
                        continue
                    url_patterns.add(url_pattern)
                    action = Action()
                    action.url = u['url']
                    action.method = 'GET'
                    action.attempt = self.attempt
                    action.save()
                    self.attempt.actions_count += 1
                    for q in u['queries']:
                        try:
                            query = Query()
                            query.content = q['content']
                            query.action = action
                            query.save()
                            self.attempt.queries_count += 1

                            if 'explain' in q:
                                explain = Explain()
                                explain.output = q['explain']
                                explain.query = query
                                explain.save()

                            if 'stats' in q:
                                metric = QueryMetric()
                                metric.name = 'stats'
                                metric.value = str(q['stats'])
                                metric.query = query
                                metric.save()
                        except:
                            pass
                    for description, count in u['counter'].iteritems():
                        counter = Counter()
                        counter.description = description
                        counter.count = count
                        counter.action = action
                        counter.save()
                except Exception, e:
                    LOG.exception(e)  

        # save screenshot
        if screenshot_path != None:
            screenshot = open(screenshot_path, 'rb')
            image = Image()
            image.data = screenshot.read()
            image.attempt = self.attempt
            image.save()

        # save statistics
        if statistics != None:
            for description, count in statistics.iteritems():
                statistic = Statistic()
                statistic.description = description
                statistic.count = count
                statistic.attempt = self.attempt
                statistic.save()

        # save informations
        if informations != None:
            for name, description in informations.iteritems():
                information = Information()
                information.name = name
                information.description = description
                information.attempt = self.attempt
                information.save()

        LOG.info("Saved Attempt #%s for %s" % (self.attempt, self.attempt.repo))
        
        # populate packages
        for pkg in self.packages_from_file:
            try:
                Dependency.objects.get_or_create(attempt=self.attempt, package=pkg, source=PACKAGE_SOURCE_FILE)
                pkg.count = pkg.count + 1
                pkg.save()
            except Exception, e:
                LOG.exception(e)  
        ## FOR
        for pkg in self.packages_from_database:
            try:
                Dependency.objects.get_or_create(attempt=self.attempt, package=pkg, source=PACKAGE_SOURCE_DATABASE)
                if pkg.version != '':
                    pkg.count = pkg.count + 1
                    pkg.save()
            except Exception, e:
                LOG.exception(e)
        ## FOR

        # make sure we update the repo to point to this latest attempt
        if attempt_result in [ATTEMPT_STATUS_MISSING_REQUIRED_FILES, ATTEMPT_STATUS_RUNNING_ERROR, ATTEMPT_STATUS_DOWNLOAD_ERROR]:
            self.repo.valid_project = False
        else:
            self.repo.valid_project = True
        self.repo.latest_attempt = self.attempt
        if self.attempt.result == ATTEMPT_STATUS_SUCCESS and self.attempt.queries_count == 0:
            self.attempt.result = ATTEMPT_STATUS_NO_QUERIES
        if self.attempt.result == ATTEMPT_STATUS_SUCCESS:
            self.repo.latest_successful_attempt = self.attempt
        self.repo.attempts_count = self.repo.attempts_count + 1
        self.repo.save()
        self.attempt.save()
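
The two re.sub('\d', '', ...) calls above collapse URLs that differ only in numeric IDs into a single pattern before deduplicating them. A self-contained sketch of that idiom, with made-up URLs:

import re

urls = ['/post/17/comments', '/post/42/comments', '/post/42/edit']

seen, unique = set(), []
for url in urls:
    pattern = re.sub(r'\d', '', url)   # '/post/17/comments' and '/post/42/comments' collapse to the same key
    if pattern in seen:
        continue
    seen.add(pattern)
    unique.append(url)

print(unique)   # ['/post/17/comments', '/post/42/edit']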

Example 58

Project: Glyphs-Scripts Source File: Batch Metric Keys.py
	def BatchMetricKeyMain( self, sender ):
		try:
			thisFont = Glyphs.font
			thisFontMaster = thisFont.selectedFontMaster
			listOfSelectedLayers = thisFont.selectedLayers
			fieldKey = self.w.keyTextField.get()
			flatFieldKey = re.sub("@Base", "@base", fieldKey)
	
			if "@base" in fieldKey or "@Base" in fieldKey:
				# Checks if a given layer has a metrics key of a glyph that has another key. Checks the glyph once and returns its name.
				def nestHuntL( targetGlyphName ):
					try:
						# Sees if the glyphName exists in the font
						if thisFont.glyphs[ targetGlyphName ]:
							# If exists, gets the left key of targetGlyph of the same layer
							targetGlyphL = thisFont.glyphs[ targetGlyphName ]
							targetLayerL = targetGlyphL.layers[ thisFontMaster.id ]
							targetLayerKeyL = targetLayerL.leftMetricsKeyUI()

							# If it's a plain number or calculation, returns the original glyph name
							a = ["=|", "+", "*", "/", "-1", "-2", "-3", "-4", "-5", "-6", "-7", "-8", "-9"]
							if targetLayerKeyL[0].isdigit() or "-" in targetLayerKeyL[0] or any([ x in targetLayerKeyL for x in a]):
								return targetGlyphName
		
							# Finds the first component and returns its name
							elif "auto" in targetLayerKeyL:
								firstComponent = targetLayerL.components[0]
								return firstComponent.componentName
		
							# This is a single-letter key, so clean it up
							else:
								cleanGlyphName = re.sub( "=", "", targetLayerKeyL )
								cleanGlyphName = re.sub( " .*", "", cleanGlyphName )
								return cleanGlyphName
		
						# If the glyph doesn't exist:
						else:
							print "Found invalid LSB key while checking the key of %s" % thisGlyph.name
					except Exception, e:
						Glyphs.showMacroWindow()
						print "nestHuntL Error: %s" % e
	
				def nestHuntR( targetGlyphName ):
					try:
						# Sees if the glyphName exists in the font
						if thisFont.glyphs[ targetGlyphName ]:
							# If exists, gets the left key of targetGlyph of the same layer
							targetGlyphR = thisFont.glyphs[ targetGlyphName ]
							targetLayerR = targetGlyphR.layers[ thisFontMaster.id ]
							targetLayerKeyR = targetLayerR.rightMetricsKeyUI()
							# If it's a plain number or calculation, returns the original glyph name
							a = ["=|", "+", "*", "/", "-1", "-2", "-3", "-4", "-5", "-6", "-7", "-8", "-9"]
							if targetLayerKeyR[0].isdigit() or "-" in targetLayerKeyR[0] or any([ x in targetLayerKeyR for x in a]):
								return targetGlyphName
		
							# Finds the last "Letter" component and returns its name
							elif "auto" in targetLayerKeyR:
								allCompernents = thisLayer.components 
								numOfCompernents = len(allCompernents)
								lastCompernent = allCompernents[numOfCompernents-1]
								lastCompernentName = lastCompernent.componentName
								lastCompernentGlyph = thisFont.glyphs[lastCompernentName]
								while lastCompernentGlyph.category != "Letter":
									numOfCompernents = numOfCompernents-1
									lastCompernent = allCompernents[numOfCompernents]
									lastCompernentName = lastCompernent.componentName
									lastCompernentGlyph = thisFont.glyphs[lastCompernentName]
								return lastCompernentName
		
							# This is a single-letter key, so clean it up
							else:
								cleanGlyphName = re.sub( "=", "", targetLayerKeyR )
								cleanGlyphName = re.sub( " .*", "", cleanGlyphName )
								return cleanGlyphName
		
						# If the glyph doesn't exist:
						else:
							print "Found invalid RSB key while checking the key of %s" % thisGlyph.name
					except Exception, e:
						Glyphs.showMacroWindow()
						print "nestHuntR Error: %s" % e
	
				# Set baseGlyphName for further nest hunting.
				for thisLayer in thisFont.selectedLayers:
					# Checks case of base glyph.
					thisGlyph = thisLayer.parent
					baseGlyphName = re.sub("\..*", "", thisGlyph.name)
					baseGlyphName = re.sub("superior", "", baseGlyphName)
					if "@Base" in fieldKey:
						baseGlyphName = baseGlyphName.capitalize()
						if thisGlyph.script == "latin" and re.match("Ij|Ae|Oe", baseGlyphName):
							baseGlyphName = baseGlyphName[0:2].upper() + baseGlyphName[2:]
							baseGlyphNameL = baseGlyphName
							baseGlyphNameR = baseGlyphName
					# Detects ligatures and sets baseGlyphNameL and R
					if "_" in baseGlyphName:
						baseGlyphNameL = re.sub("_.*", "", baseGlyphName)
						baseGlyphNameR = re.sub(".*_", "", baseGlyphName)
					elif "ordfeminine" in thisGlyph.name:
						baseGlyphNameL = "a"
						baseGlyphNameR = "a"
					elif "ordmasculine" in thisGlyph.name:
						baseGlyphNameL = "o"
						baseGlyphNameR = "o"
					else:
						baseGlyphNameL = baseGlyphName
						baseGlyphNameR = baseGlyphName
					thisFont.disableUpdateInterface()
					thisGlyph.beginUndo()	
	
					# Runs nestHuntL multiple times until it finds the final glyph,
					# and then set the final left metrics key.
					if self.w.applyL.get():
						if self.w.avoidNest:
							dummyOldL = nestHuntL(baseGlyphNameL)
							dummyNewL = nestHuntL(dummyOldL)
							while dummyOldL != dummyNewL:
								dummyOldL = nestHuntL(dummyNewL)
								dummyNewL = nestHuntL(dummyOldL)
							finalKeyL = re.sub("@base", dummyNewL, flatFieldKey)
							thisGlyph.setLeftMetricsKey_(finalKeyL)
	
					# Runs nestHuntR multiple times until it finds the final glyph,
					# and then set the final right metrics key.
					if self.w.applyR.get():
						if self.w.avoidNest:
							dummyOldR = nestHuntR(baseGlyphNameR)
							dummyNewR = nestHuntR(dummyOldR)
							while dummyOldR != dummyNewR:
								dummyOldR = nestHuntR(dummyNewR)
								dummyNewR = nestHuntR(dummyOldR)
							finalKeyR = re.sub("@base", dummyNewR, flatFieldKey)
		
							
							# Processes as normal
							if baseGlyphName != "Q":
								thisGlyph.setRightMetricsKey_(finalKeyR)
							# Uses width of the width of O of the same group
							elif baseGlyphName == "Q" and self.w.radioQ.get() == 0:
								Qbefore = thisGlyph.name
								Qname = re.sub("Q", "O", Qbefore)
								Qname = re.sub("q", "o", Qbefore)
								glyphO = thisFont.glyphs[Qname]
								numOfMasters = len(thisFont.masters)
								thisGlyph.setWidth_(thisOWidth)
							# Uses RSB as normal
							elif baseGlyphName == "Q" and self.w.radioQ.get() == 1:
								thisGlyph.setRightMetricsKey_(finalKeyR)
	
					thisGlyph.endUndo()
					thisFont.enableUpdateInterface()
				self.w.close()
	
			else:
				pass
				for thisLayer in listOfSelectedLayers:
					thisGlyph = thisLayer.parent
					thisFont.disableUpdateInterface()
					thisGlyph.beginUndo()	
					for i in thisGlyph.layers:
						if self.w.applyL.get():
							i.setLeftMetricsKey_(fieldKey)
						if self.w.applyR.get():
							i.setRightMetricsKey_(fieldKey)
					thisGlyph.endUndo()
					thisFont.enableUpdateInterface()
				self.w.close()
		except Exception, e:
			Glyphs.showMacroWindow()
			print "BatchMetricKeyMain Error: %s" % e

Example 59

Project: edx-platform Source File: formula.py
    def preprocess_pmathml(self, xml):
        r"""
        Pre-process presentation MathML from ASCIIMathML to make it more
        acceptable for SnuggleTeX, and also to accommodate some sympy
        conventions (eg hat(i) for \hat{i}).

        This method would be a good spot to look for an integral and convert
        it, if possible...
        """

        if isinstance(xml, (str, unicode)):
            xml = etree.fromstring(xml)		# TODO: wrap in try

        xml = self.fix_greek_in_mathml(xml)	 # convert greek utf letters to greek spelled out in ascii

        def gettag(expr):
            return re.sub('{http://[^}]+}', '', expr.tag)

        def fix_pmathml(xml):
            """
            f and g are processed as functions by asciimathml, eg "f-2" turns
            into "<mrow><mi>f</mi><mo>-</mo></mrow><mn>2</mn>" this is
            really terrible for turning into cmathml.  undo this here.
            """
            for k in xml:
                tag = gettag(k)
                if tag == 'mrow':
                    if len(k) == 2:
                        if gettag(k[0]) == 'mi' and k[0].text in ['f', 'g'] and gettag(k[1]) == 'mo':
                            idx = xml.index(k)
                            xml.insert(idx, deepcopy(k[0]))	 # drop the <mrow> container
                            xml.insert(idx + 1, deepcopy(k[1]))
                            xml.remove(k)
                fix_pmathml(k)

        fix_pmathml(xml)

        def fix_hat(xml):
            """
            hat i is turned into <mover><mi>i</mi><mo>^</mo></mover> ; mangle
            this into <mi>hat(f)</mi>. hat i is also sometimes turned into
            <mover><mrow> <mi>j</mi> </mrow><mo>^</mo></mover>
            """
            for k in xml:
                tag = gettag(k)
                if tag == 'mover':
                    if len(k) == 2:
                        if gettag(k[0]) == 'mi' and gettag(k[1]) == 'mo' and str(k[1].text) == '^':
                            newk = etree.Element('mi')
                            newk.text = 'hat(%s)' % k[0].text
                            xml.replace(k, newk)
                        if gettag(k[0]) == 'mrow' and gettag(k[0][0]) == 'mi' and \
                           gettag(k[1]) == 'mo' and str(k[1].text) == '^':
                            newk = etree.Element('mi')
                            newk.text = 'hat(%s)' % k[0][0].text
                            xml.replace(k, newk)
                fix_hat(k)
        fix_hat(xml)

        def flatten_pmathml(xml):
            """
            Give the text version of certain PMathML elements

            Sometimes MathML will be given with each letter separated (it
            doesn't know if it's implicit multiplication or what). From an xml
            node, find the (text only) variable name it represents. So it takes
            <mrow>
              <mi>m</mi>
              <mi>a</mi>
              <mi>x</mi>
            </mrow>
            and returns 'max', for easier use later on.
            """
            tag = gettag(xml)
            if tag == 'mn':
                return xml.text
            elif tag == 'mi':
                return xml.text
            elif tag == 'mrow':
                return ''.join([flatten_pmathml(y) for y in xml])
            raise Exception('[flatten_pmathml] unknown tag %s' % tag)

        def fix_mathvariant(parent):
            """
            Fix certain kinds of math variants

            Literally replace <mstyle mathvariant="script"><mi>N</mi></mstyle>
            with 'scriptN'. There have been problems using script_N or script(N)
            """
            for child in parent:
                if gettag(child) == 'mstyle' and child.get('mathvariant') == 'script':
                    newchild = etree.Element('mi')
                    newchild.text = 'script%s' % flatten_pmathml(child[0])
                    parent.replace(child, newchild)
                fix_mathvariant(child)
        fix_mathvariant(xml)

        # find "tagged" superscripts
        # they have the character \u200b in the superscript
        # replace them with a__b so snuggle doesn't get confused
        def fix_superscripts(xml):
            """ Look for and replace sup elements with 'X__Y' or 'X_Y__Z'

            In the javascript, variables with '__X' in them had an invisible
            character inserted into the sup (to distinguish from powers)
            E.g. normal:
            <msubsup>
              <mi>a</mi>
              <mi>b</mi>
              <mi>c</mi>
            </msubsup>
            to be interpreted '(a_b)^c' (nothing done by this method)

            And modified:
            <msubsup>
              <mi>b</mi>
              <mi>x</mi>
              <mrow>
                <mo>&#x200B;</mo>
                <mi>d</mi>
              </mrow>
            </msubsup>
            to be interpreted 'a_b__c'

            also:
            <msup>
              <mi>x</mi>
              <mrow>
                <mo>&#x200B;</mo>
                <mi>B</mi>
              </mrow>
            </msup>
            to be 'x__B'
            """
            for k in xml:
                tag = gettag(k)

                # match things like the last example--
                # the second item in msub is an mrow with the first
                # character equal to \u200b
                if (
                        tag == 'msup' and
                        len(k) == 2 and gettag(k[1]) == 'mrow' and
                        gettag(k[1][0]) == 'mo' and k[1][0].text == u'\u200b'  # whew
                ):

                    # replace the msup with 'X__Y'
                    k[1].remove(k[1][0])
                    newk = etree.Element('mi')
                    newk.text = '%s__%s' % (flatten_pmathml(k[0]), flatten_pmathml(k[1]))
                    xml.replace(k, newk)

                # match things like the middle example-
                # the third item in msubsup is an mrow with the first
                # character equal to \u200b
                if (
                        tag == 'msubsup' and
                        len(k) == 3 and gettag(k[2]) == 'mrow' and
                        gettag(k[2][0]) == 'mo' and k[2][0].text == u'\u200b'    # whew
                ):

                    # replace the msubsup with 'X_Y__Z'
                    k[2].remove(k[2][0])
                    newk = etree.Element('mi')
                    newk.text = '%s_%s__%s' % (flatten_pmathml(k[0]), flatten_pmathml(k[1]), flatten_pmathml(k[2]))
                    xml.replace(k, newk)

                fix_superscripts(k)
        fix_superscripts(xml)

        def fix_msubsup(parent):
            """
            Snuggle returns an error when it sees an <msubsup> replace such
            elements with an <msup>, except the first element is of
            the form a_b. I.e. map a_b^c => (a_b)^c
            """
            for child in parent:
                # fix msubsup
                if gettag(child) == 'msubsup' and len(child) == 3:
                    newchild = etree.Element('msup')
                    newbase = etree.Element('mi')
                    newbase.text = '%s_%s' % (flatten_pmathml(child[0]), flatten_pmathml(child[1]))
                    newexp = child[2]
                    newchild.append(newbase)
                    newchild.append(newexp)
                    parent.replace(child, newchild)

                fix_msubsup(child)
        fix_msubsup(xml)

        self.xml = xml  # pylint: disable=attribute-defined-outside-init
        return self.xml
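
The gettag helper above relies on ElementTree qualifying tag names as '{namespace-uri}localname'; the re.sub simply strips the braced namespace so tags can be compared as plain 'mi', 'mrow', and so on. A standalone sketch:

import re
import xml.etree.ElementTree as etree

xml = etree.fromstring(
    '<math xmlns="http://www.w3.org/1998/Math/MathML"><mi>x</mi></math>'
)

def gettag(elem):
    # '{http://www.w3.org/1998/Math/MathML}mi' -> 'mi'
    return re.sub(r'{http://[^}]+}', '', elem.tag)

print([gettag(e) for e in xml.iter()])   # ['math', 'mi']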

Example 60

Project: bakthat Source File: __init__.py
@app.cmd(help="Backup a file or a directory, backup the current directory if no arg is provided.")
@app.cmd_arg('filename', type=str, default=os.getcwd(), nargs="?")
@app.cmd_arg('-d', '--destination', type=str, help="s3|glacier|swift", default=None)
@app.cmd_arg('--prompt', type=str, help="yes|no", default="yes")
@app.cmd_arg('-t', '--tags', type=str, help="space separated tags", default="")
@app.cmd_arg('-p', '--profile', type=str, default="default", help="profile name (default by default)")
@app.cmd_arg('-c', '--config', type=str, default=CONFIG_FILE, help="path to config file")
@app.cmd_arg('-k', '--key', type=str, default=None, help="Custom key for periodic backups (works only with BakManager.io hook.)")
@app.cmd_arg('--exclude-file', type=str, default=None)
@app.cmd_arg('--s3-reduced-redundancy', action="store_true")
def backup(filename=os.getcwd(), destination=None, profile="default", config=CONFIG_FILE, prompt="yes", tags=[], key=None, exclude_file=None, s3_reduced_redundancy=False, **kwargs):
    """Perform backup.

    :type filename: str
    :param filename: File/directory to backup.

    :type destination: str
    :param destination: s3|glacier|swift

    :type prompt: str
    :param prompt: Disable password prompt, disable encryption,
        only useful when using bakthat in command line mode.

    :type tags: str or list
    :param tags: Tags, either as a space-separated str
        or directly as a list of str (if calling from Python).

    :type password: str
    :keyword password: Password, empty string to disable encryption.

    :type conf: dict
    :keyword conf: Override/set AWS configuration.

    :type custom_filename: str
    :keyword custom_filename: Override the original filename (only in metadata)

    :rtype: dict
    :return: A dict containing the following keys: stored_filename, size, metadata, backend and filename.

    """
    storage_backend, destination, conf = _get_store_backend(config, destination, profile)
    backup_file_fmt = "{0}.{1}.tgz"

    session_id = str(uuid.uuid4())
    events.before_backup(session_id)

    # Check if compression is disabled on the configuration.
    if conf:
        compress = conf.get("compress", True)
    else:
        compress = config.get(profile).get("compress", True)

    if not compress:
        backup_file_fmt = "{0}.{1}"

    log.info("Backing up " + filename)

    if exclude_file and os.path.isfile(exclude_file):
        EXCLUDE_FILES.insert(0, exclude_file)

    _exclude = lambda filename: False
    if os.path.isdir(filename):
        join = functools.partial(os.path.join, filename)
        for efile in EXCLUDE_FILES:
            efile = join(efile)
            if os.path.isfile(efile):
                _exclude = _get_exclude(efile)
                log.info("Using {0} to exclude files.".format(efile))

    arcname = filename.strip('/').split('/')[-1]
    now = datetime.utcnow()
    date_component = now.strftime("%Y%m%d%H%M%S")
    stored_filename = backup_file_fmt.format(arcname, date_component)

    backup_date = int(now.strftime("%s"))
    backup_data = dict(filename=kwargs.get("custom_filename", arcname),
                       backup_date=backup_date,
                       last_updated=backup_date,
                       backend=destination,
                       is_deleted=False)

    # Useful only when using bakmanager.io hook
    backup_key = key

    password = kwargs.get("password", os.environ.get("BAKTHAT_PASSWORD"))
    if password is None and prompt.lower() != "no":
        password = getpass("Password (blank to disable encryption): ")
        if password:
            password2 = getpass("Password confirmation: ")
            if password != password2:
                log.error("Password confirmation doesn't match")
                return

    if not compress:
        log.info("Compression disabled")
        outname = filename
        with open(outname) as outfile:
            backup_data["size"] = os.fstat(outfile.fileno()).st_size
        bakthat_compression = False

    # Check if the file is not already compressed
    elif mimetypes.guess_type(arcname) == ('application/x-tar', 'gzip'):
        log.info("File already compressed")
        outname = filename

        # removing extension to reformat filename
        new_arcname = re.sub(r'(\.t(ar\.)?gz)', '', arcname)
        stored_filename = backup_file_fmt.format(new_arcname, date_component)

        with open(outname) as outfile:
            backup_data["size"] = os.fstat(outfile.fileno()).st_size

        bakthat_compression = False
    else:
        # If not we compress it
        log.info("Compressing...")

        with tempfile.NamedTemporaryFile(delete=False) as out:
            with closing(tarfile.open(fileobj=out, mode="w:gz")) as tar:
                tar.add(filename, arcname=arcname, exclude=_exclude)
            outname = out.name
            out.seek(0)
            backup_data["size"] = os.fstat(out.fileno()).st_size
        bakthat_compression = True

    bakthat_encryption = False
    if password:
        bakthat_encryption = True
        log.info("Encrypting...")
        encrypted_out = tempfile.NamedTemporaryFile(delete=False)
        encrypt_file(outname, encrypted_out.name, password)
        stored_filename += ".enc"

        # We only remove the file if the archive is created by bakthat
        if bakthat_compression:
            os.remove(outname)  # remove non-encrypted tmp file

        outname = encrypted_out.name

        encrypted_out.seek(0)
        backup_data["size"] = os.fstat(encrypted_out.fileno()).st_size

    # Handling tags metadata
    if isinstance(tags, list):
        tags = " ".join(tags)

    backup_data["tags"] = tags

    backup_data["metadata"] = dict(is_enc=bakthat_encryption,
                                   client=socket.gethostname())
    backup_data["stored_filename"] = stored_filename

    access_key = storage_backend.conf.get("access_key")
    container_key = storage_backend.conf.get(storage_backend.container_key)
    backup_data["backend_hash"] = hashlib.sha512(access_key + container_key).hexdigest()

    log.info("Uploading...")
    storage_backend.upload(stored_filename, outname, s3_reduced_redundancy=s3_reduced_redundancy)

    # We only remove the file if the archive is created by bakthat
    if bakthat_compression or bakthat_encryption:
        os.remove(outname)

    log.debug(backup_data)

    # Insert backup metadata in SQLite
    backup = Backups.create(**backup_data)

    BakSyncer(conf).sync_auto()

    # bakmanager.io hook, enable with -k/--key parameter
    if backup_key:
        bakmanager_hook(conf, backup_data, backup_key)

    events.on_backup(session_id, backup)

    return backup
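
The re.sub that builds new_arcname above removes a trailing '.tgz' or '.tar.gz' so the timestamped stored filename does not end up with a doubled extension. A small sketch of just that step:

import re

def strip_tar_ext(name):
    # remove '.tgz' or '.tar.gz' before reformatting as '<name>.<date>.tgz'
    return re.sub(r'(\.t(ar\.)?gz)', '', name)

for arcname in ('photos.tar.gz', 'photos.tgz', 'photos'):
    print(strip_tar_ext(arcname))   # prints 'photos' three times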

Example 61

Project: KaraKara Source File: __init__.py
def main(global_config, **settings):
    """
        This function returns a Pyramid WSGI application.
    """
    # Setup --------------------------------------------------------------------

    # Db
    init_DBSession(settings)

    # Pyramid Global Settings
    config = Configurator(settings=settings)  # , autocommit=True

    # Register Additional Includes ---------------------------------------------
    config.include('pyramid_mako')  # The mako.directories value is updated in the scan for addons. We trigger the import here to include the correct folders.

    # Reload on template change
    template_filenames = map(operator.attrgetter('absolute'), file_scan(config.registry.settings['mako.directories']))
    add_file_callback(lambda: template_filenames)

    # Parse/Convert setting keys that have specified datatypes
    for key in config.registry.settings.keys():
        config.registry.settings[key] = convert_str_with_type(config.registry.settings[key])

    # i18n
    config.add_translation_dirs(config.registry.settings['i18n.translation_dirs'])

    # Session Manager
    session_settings = extract_subkeys(config.registry.settings, 'session.')
    session_factory = SignedCookieSessionFactory(serializer=json_serializer, **session_settings)
    config.set_session_factory(session_factory)

    # Cachebust etags ----------------------------------------------------------
    #  crude implementation; count the number of tags in db, if that's changed, the etags will invalidate
    if not config.registry.settings['server.etag.cache_buster']:
        from .model.actions import last_update
        config.registry.settings['server.etag.cache_buster'] = 'last_update:{0}'.format(str(last_update()))

    # Search Config ------------------------------------------------------------
    import karakara.views.search
    karakara.views.search.search_config = read_json(config.registry.settings['karakara.search.view.config'])

    # WebSocket ----------------------------------------------------------------

    class NullAuthEchoServerManager(object):
        def recv(self, *args, **kwargs):
            pass
    socket_manager = NullAuthEchoServerManager()

    # Do not activate websocket if in community mode
    if config.registry.settings.get('karakara.server.mode') == 'comunity':
        config.registry.settings['karakara.websocket.port'] = None

    if config.registry.settings.get('karakara.websocket.port'):
        def authenicator(key):
            """Only admin authenticated keys can connect to the websocket"""
            request = Request({'HTTP_COOKIE':'{0}={1}'.format(config.registry.settings['session.cookie_name'],key)})
            session_data = session_factory(request)
            return session_data and session_data.get('admin')
        try:
            _socket_manager = AuthEchoServerManager(
                authenticator=authenicator,
                websocket_port=config.registry.settings['karakara.websocket.port'],
                tcp_port=config.registry.settings.get('karakara.tcp.port'),
            )
            _socket_manager.start()
            socket_manager = _socket_manager
        except OSError:
            log.warn('Unable to setup websocket')

    config.registry['socket_manager'] = socket_manager


    # Login Providers ----------------------------------------------------------

    from .views.comunity_login import social_login
    social_login.user_store = ComunityUserStore()
    login_providers = config.registry.settings.get('login.provider.enabled')
    # Facebook
    if 'facebook' in login_providers:
        for settings_key in ('facebook.appid', 'facebook.secret'):
            assert config.registry.settings.get(settings_key), 'To use facebook as a login provider appid and secret must be provided'
        social_login.add_login_provider(FacebookLogin(
            appid=config.registry.settings.get('facebook.appid'),
            secret=config.registry.settings.get('facebook.secret'),
            permissions=config.registry.settings.get('facebook.permissions'),
        ))
    # Firefox Persona (Deprecated technology but a useful reference)
    #if 'persona' in login_providers:
    #    social_login.add_login_provider(PersonaLogin(
    #        site_url=config.registry.settings.get('server.url')
    #    ))
    # No login provider
    if not login_providers and config.registry.settings.get('karakara.server.mode') == 'development':
        # Auto login if no service keys are provided
        social_login.add_login_provider(NullLoginProvider())
        social_login.user_store = NullComunityUserStore()
    template_helpers.javascript_inline['comunity'] = social_login.html_includes

    # Renderers ----------------------------------------------------------------

    # AllanC - currently the auto_format decorator does all the formatting work
    #          it would be far preferable to use the pyramid renderer framework
    #          issue is, we want to set the renderer to be dynamic based on the url given
    #          I don't want to define EVERY method with loads of renderer tags
    #          and I don't want to define 5+ routes for every view callable with different formats
    #          We need a nice way of doing this in pyramid, and right now, after HOURS of trawling
    #          the doc and experimenting, I can't find one.
    #from .renderers.auto_render_factory import AutoRendererFactory, handle_before_render
    #config.add_renderer(None   , AutoRendererFactory) #'renderers.auto_render_factory.auto_renderer_factory'
    #config.add_renderer('.html', 'pyramid.mako_templating.renderer_factory')
    #config.add_subscriber(handle_before_render , pyramid.events.BeforeRender) # maybe use this to set renderer?
    # closest I've seen
    #   http://zhuoqiang.me/a/restful-pyramid
    #   http://stackoverflow.com/questions/4633320/is-there-a-better-way-to-switch-between-html-and-json-output-in-pyramid


    # Routes -------------------------------------------------------------------

    # Static Routes
    config.add_static_view(name='ext'   , path='../externals/static') #cache_max_age=3600 # settings["static.assets"]
    config.add_static_view(name='static', path='karakara:{0}'.format(settings["static.assets"])) #cache_max_age=3600 # settings["static.assets"]
    config.add_static_view(name='player', path=settings["static.player"])

    # AllanC - it's official ... static route setup and generation is a mess in pyramid
    #config.add_static_view(name=settings["static.media" ], path="karakara:media" )
    config.add_static_view(name='files' , path=settings["static.processed"])

    # Routes
    def append_format_pattern(route):
        return re.sub(r'{(.*)}', r'{\1:[^/\.]+}', route) + r'{spacer:[.]?}{format:(%s)?}' % '|'.join(registered_formats())

    config.add_route('home'          , append_format_pattern('/')              )
    config.add_route('track'         , append_format_pattern('/track/{id}')    )
    config.add_route('track_list'    , append_format_pattern('/track_list')    )
    config.add_route('queue'         , append_format_pattern('/queue')         )
    config.add_route('priority_tokens', append_format_pattern('/priority_tokens'))
    config.add_route('fave'          , append_format_pattern('/fave')          )
    config.add_route('message'       , append_format_pattern('/message')          )
    config.add_route('admin_toggle'  , append_format_pattern('/admin')         )
    config.add_route('admin_lock'    , append_format_pattern('/admin_lock')    )
    config.add_route('remote'        , append_format_pattern('/remote')        )
    config.add_route('feedback'      , append_format_pattern('/feedback')      )
    config.add_route('settings'      , append_format_pattern('/settings')      )
    config.add_route('random_images' , append_format_pattern('/random_images') )
    config.add_route('inject_testdata' , append_format_pattern('/inject_testdata') )
    config.add_route('stats'         , append_format_pattern('/stats')         )
    config.add_route('comunity'      , append_format_pattern('/comunity')      )
    config.add_route('comunity_login', append_format_pattern('/comunity/login'))
    config.add_route('comunity_logout', append_format_pattern('/comunity/logout'))
    config.add_route('comunity_list' , append_format_pattern('/comunity/list') )
    config.add_route('comunity_track', append_format_pattern('/comunity/track/{id}'))
    config.add_route('comunity_upload', append_format_pattern('/comunity/upload'))
    config.add_route('comunity_settings', append_format_pattern('/comunity/settings'))
    config.add_route('comunity_processmedia_log', append_format_pattern('/comunity/processmedia_log'))

    config.add_route('search_tags'   , '/search_tags/{tags:.*}')
    config.add_route('search_list'   , '/search_list/{tags:.*}')

    # Upload extras -----
    #config.add_static_view(name=settings['upload.route.uploaded'], path=settings['upload.path'])  # the 'upload' route above always matchs first
    config.add_route('upload', '/upload{sep:/?}{name:.*}')

    # Events -------------------------------------------------------------------
    config.add_subscriber(add_localizer_to_request, pyramid.events.NewRequest)
    config.add_subscriber(add_render_globals_to_template, pyramid.events.BeforeRender)

    # Return -------------------------------------------------------------------
    config.scan(ignore='.tests')
    config.scan('externals.lib.pyramid_helpers.views')
    return config.make_wsgi_app()
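
append_format_pattern above uses a backreference in the replacement string to constrain each {placeholder} in a route and then appends an optional format suffix. A minimal sketch of that rewriting, independent of Pyramid (the format list here is invented):

import re

def append_format_pattern(route, formats=('json', 'html')):
    constrained = re.sub(r'{(.*)}', r'{\1:[^/\.]+}', route)
    return constrained + r'{spacer:[.]?}{format:(%s)?}' % '|'.join(formats)

print(append_format_pattern('/track/{id}'))
# /track/{id:[^/\.]+}{spacer:[.]?}{format:(json|html)?}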

Example 62

Project: system-config-printer Source File: ppds.py
    def getPPDNamesFromDeviceID (self, mfg, mdl, description="",
                                 commandsets=None, uri=None,
                                 make_and_model=None):
        """
	Obtain a best-effort PPD match for an IEEE 1284 Device ID.

	@param mfg: MFG or MANUFACTURER field
	@type mfg: string
	@param mdl: MDL or MODEL field
	@type mdl: string
	@param description: DES or DESCRIPTION field, optional
	@type description: string
	@param commandsets: CMD or COMMANDSET field, optional
	@type commandsets: string
	@param uri: device URI, optional (only needed for debugging)
	@type uri: string
        @param make_and_model: device-make-and-model string
        @type make_and_model: string
	@returns: a dict of fit (string) indexed by PPD name
	"""
        _debugprint ("\n%s %s" % (mfg, mdl))
        orig_mfg = mfg
        orig_mdl = mdl
        self._init_ids ()

        if commandsets is None:
            commandsets = []

        # Start with an empty result list and build it up using
        # several search methods, in increasing order of fuzziness.
        fit = {}

        # First, try looking up the device using the manufacturer and
        # model fields from the Device ID exactly as they appear (but
        # case-insensitively).
        mfgl = mfg.lower ()
        mdll = mdl.lower ()

        id_matched = False
        try:
            for each in self.ids[mfgl][mdll]:
                fit[each] = self.FIT_EXACT
            id_matched = True
        except KeyError:
            pass

        # The HP PPDs say "HP" not "Hewlett-Packard", so try that.
        if mfgl == "hewlett-packard":
            try:
                for each in self.ids["hp"][mdll]:
                    fit[each] = self.FIT_EXACT
                print ("cuem Incorrect IEEE 1284 Device ID: %s" %
                       self.ids["hp"][mdll])
                print ("**** Actual ID is MFG:%s;MDL:%s;" % (mfg, mdl))
                print ("**** Please report a bug against the HPLIP component")
                id_matched = True
            except KeyError:
                pass

        # Now try looking up the device by ppd-make-and-model.
        _debugprint ("Trying make/model names")
        mdls = None
        self._init_makes ()
        make = None
        if mfgl == "":
            (mfg, mdl) = ppdMakeModelSplit (mdl)
            mfgl = normalize (mfg)
            mdll = normalize (mdl)

        _debugprint ("mfgl: %s" % mfgl)
        _debugprint ("mdll: %s" % mdll)
        mfgrepl = {"hewlett-packard": "hp",
                   "lexmark international": "lexmark",
                   "kyocera": "kyocera mita"}
        if mfgl in self.lmakes:
            # Found manufacturer.
            make = self.lmakes[mfgl]
        elif mfgl in mfgrepl:
            rmfg = mfgrepl[mfgl]
            if rmfg in self.lmakes:
                mfg = rmfg
                mfgl = mfg
                # Found manufacturer (after mapping to canonical name)
                _debugprint ("remapped mfgl: %s" % mfgl)
                make = self.lmakes[mfgl]

        _debugprint ("make: %s" % make)
        if make is not None:
            mdls = self.makes[make]
            mdlsl = self.lmodels[normalize(make)]

            # Remove manufacturer name from model field
            for prefix in [mfgl, 'hewlett-packard', 'hp']:
                if mdll.startswith (prefix + ' '):
                    mdl = mdl[len (prefix) + 1:]
                    mdll = normalize (mdl)
                    _debugprint ("unprefixed mdll: %s" % mdll)

            if mdll in self.lmodels[mfgl]:
                model = mdlsl[mdll]
                for each in mdls[model].keys ():
                    fit[each] = self.FIT_EXACT
                    _debugprint ("%s: %s" % (fit[each], each))
            else:
                # Make use of the model name clean-up in the
                # ppdMakeModelSplit () function
                (mfg2, mdl2) = ppdMakeModelSplit (mfg + " " + mdl)
                mdl2l = normalize (mdl2)
                _debugprint ("re-split mdll: %s" % mdl2l)
                if mdl2l in self.lmodels[mfgl]:
                    model = mdlsl[mdl2l]
                    for each in list(mdls[model].keys ()):
                        fit[each] = self.FIT_EXACT
                        _debugprint ("%s: %s" % (fit[each], each))
      
        if not fit and mdls:
            (s, ppds) = self._findBestMatchPPDs (mdls, mdl)
            if s != self.FIT_NONE:
                for each in ppds:
                    fit[each] = s
                    _debugprint ("%s: %s" % (fit[each], each))

        if commandsets:
            if type (commandsets) != list:
                commandsets = commandsets.split (',')

            _debugprint ("Checking CMD field")
            generic = self._getPPDNameFromCommandSet (commandsets)
            if generic:
                for driver in generic:
                    fit[driver] = self.FIT_GENERIC
                    _debugprint ("%s: %s" % (fit[driver], driver))

        # What about the CMD field of the Device ID?  Some devices
        # have optional units for page description languages, such as
        # PostScript, and they will report different CMD strings
        # accordingly.
        #
        # By convention, if a PPD contains a Device ID with a CMD
        # field, that PPD can only be used whenever any of the
        # comma-separated words in the CMD field appear in the
        # device's ID.
        # (See Red Hat bug #630058).
        #
        # We'll do that check now, and any PPDs that fail
        # (e.g. PostScript PPD for non-PostScript printer) can be
        # eliminated from the list.
        #
        # The reason we don't do this check any earlier is that we
        # don't want to eliminate PPDs only to have the fuzzy matcher
        # add them back in afterwards.
        #
        # While doing this, any drivers that we can positively confirm
        # as using a command set understood by the printer will be
        # converted from FIT_EXACT to FIT_EXACT_CMD.
        if id_matched and len (commandsets) > 0:
            failed = set()
            exact_cmd = set()
            for ppdname in fit.keys ():
                ppd_cmd_field = None
                ppd = self.ppds[ppdname]
                ppd_device_id = _singleton (ppd.get ('ppd-device-id'))
                if ppd_device_id:
                    ppd_device_id_dict = parseDeviceID (ppd_device_id)
                    ppd_cmd_field = ppd_device_id_dict["CMD"]

                if (not ppd_cmd_field and
                    # ppd-type is not reliable for driver-generated
                    # PPDs (see CUPS STR #3720).  Neither gutenprint
                    # nor foomatic specify ppd-type in their CUPS
                    # drivers.
                    ppdname.find (":") == -1):
                    # If this is a PostScript PPD we know which
                    # command set it will use.
                    ppd_type = _singleton (ppd.get ('ppd-type'))
                    if ppd_type == "postscript":
                        ppd_cmd_field = ["POSTSCRIPT"]

                if not ppd_cmd_field:
                    # We can't be sure which command set this driver
                    # uses.
                    continue

                usable = False
                for pdl in ppd_cmd_field:
                    if pdl in commandsets:
                        usable = True
                        break

                if usable:
                    exact_cmd.add (ppdname)
                else:
                    failed.add (ppdname)

            # Assign the more specific fit "exact-cmd" to those that
            # positively matched the CMD field.
            for each in exact_cmd:
                if fit[each] == self.FIT_EXACT:
                    fit[each] = self.FIT_EXACT_CMD
                    _debugprint (self.FIT_EXACT_CMD + ": %s" % each)

            if len (failed) < len ([d for (d, m) in fit.items ()
                                    if m != 'generic']):
                _debugprint ("Removed %s due to CMD mis-match" % failed)
                for each in failed:
                    del fit[each]
            else:
                _debugprint ("Not removing %s " % failed +
                             "due to CMD mis-match as it would "
                             "leave nothing good")

        if not fit:
            fallbacks = ["textonly.ppd", "postscript.ppd"]
            found = False
            for fallback in fallbacks:
                _debugprint ("'%s' fallback" % fallback)
                fallbackgz = fallback + ".gz"
                for ppdpath in self.ppds.keys ():
                    if (ppdpath.endswith (fallback) or
                        ppdpath.endswith (fallbackgz)):
                        fit[ppdpath] = self.FIT_NONE
                        found = True
                        break

                if found:
                    break

                _debugprint ("Fallback '%s' not available" % fallback)

            if not found:
                _debugprint ("No fallback available; choosing any")
                fit[list(self.ppds.keys ())[0]] = self.FIT_NONE

        if not id_matched:
            sanitised_uri = re.sub (pattern="//[^@]*@/?", repl="//",
                                    string=str (uri))
            try:
                cmd = reduce (lambda x, y: x + ","+ y, commandsets)
            except TypeError:
                cmd = ""
            id = "MFG:%s;MDL:%s;" % (orig_mfg, orig_mdl)
            if cmd:
                id += "CMD:%s;" % cmd
            if description:
                id += "DES:%s;" % description

            print ("No ID match for device %s:" % sanitised_uri)
            print (id)

        return fit
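
The keyword-argument form of re.sub near the end of the method scrubs any user:password@ credentials out of the device URI before it is printed. A standalone sketch:

import re

uri = 'smb://user:secret@print-server/laserjet'
sanitised = re.sub(pattern='//[^@]*@/?', repl='//', string=uri)
print(sanitised)   # smb://print-server/laserjet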

Example 63

Project: inspectors-general Source File: doj.py
def extract_info(content, directory, year_range):
  # goes through each agency or content bucket
  if directory in not_agency:
    agency = "doj"
    agency_name = "Department of Justice"
  elif directory[:11] == "Immigration":
    agency = "ins"
    agency_name = "Immigration and Naturalization Service"
  else:
    agency = agency_decoder[directory][1]
    agency_name = agency_decoder[directory][0]

  # there can be multiple reports per blurb
  blurbs = content[-1].find_all("p")
  report_count = 0

  for b in blurbs:
    # date
    # finding new dates that are just above the old ones

    # this is the format of the newest entries and the easiest to get
    x = b.previous_sibling
    y = b.previous_sibling.previous_sibling
    if isinstance(y, Tag) and y.get('class') == ['date']:
      date_string = y.string
    elif isinstance(x, Tag) and x.get('class') == ['date']:
      date_string = x.string
    else:
      date_string = None

    # finding older dates that are at the end of the text
    if date_string == None:
      try:
        date_string = b.get_text()
      except:
        date_string = None

    if date_string is not None:
      # get rid of extra stuff that is not the date
      date_text = re.sub(r'\([^)]*\)', '', date_string)
      date_text = re.sub(r'\[(.*?)\]', '', date_text)
      # chop up the string, the last part should be the date
      date_chopped = date_text.rsplit(',')
      day = date_chopped[-1]
      # ATF added dashes
      if "-" in day:
        date_chopped = date_text.rsplit('-')
        day = date_chopped[0]
      # cleaning
      date_string = day.strip()
      date_string = date_string.replace("  ", " ")
      day = day.strip()

      # this is a date written out with a comma
      if day.isdigit():
        date_string = date_chopped[-2] + "," + date_chopped[-1]

    # check for missing commas
    try:
      date_string = datetime.strptime(date_string, "%B %d %Y")
      date_string = datetime.strftime(date_string, "%B %d, %Y")
    except ValueError:
      pass

    # for dates without a day
    if date_string is not None:
      date_string = date_string.strip()
      if "," not in date_string:
        date_test = date_string.replace(" ", " 1, ")
        try:
          datetime.strptime(date_test, "%B %d, %Y")
          date_string = date_test
        except ValueError:
          pass

    # going through each link in a paragraph
    for l in b.find_all("a"):
      date = None
      real_title = None
      # most cases pass this test
      try:
        date = datetime.strptime(date_string, "%B %d, %Y")
      # these ones go to a coding purgatory called odd_link
      except ValueError:
        info = odd_link(b, date_string, l, directory, )
        # this should give better titles than "pdf" or "Press Release"
        real_title = info["real_title"]
        date_string = info["date_string"]
        # these are links to things that are not reports
        if real_title == False and date_string == False:
          break
        elif "," not in date_string:
          date_string = date_string.strip()
          date_string = date_string.replace(" ", " 1, ")
          date = datetime.strptime(date_string, "%B %d, %Y")

      if date is None:
        date = datetime.strptime(date_string, "%B %d, %Y")

      report_year = datetime.strftime(date, "%Y")
      published_on = datetime.strftime(date, "%Y-%m-%d")

      # trying to get the most descriptive title
      # I go from the best methods to fall back and override exceptions
      try:
        string_title = b.text
      except:
        string_title = b.string

      if string_title == None:
        string_title = b.contents
        if "<a href=" in str(string_title):
          string_title = b.contents[0]

      link = l.get("href")
      link = strip_url_fragment(link)
      if link != None:
        # title
        try:
          title = l.text
        except:
          title = l.string
        if title in ("HTML", "PDF", "Executive Summary", "Full Report"):
          title = string_title

        # in some cases the title is a heading a few elements up; this gets passed in via odd_link
        if real_title is not None:
          title = real_title

        if title == 'id="content" name="content">':
          title =  b.string
          if title == None:
            title = b.text

        try:
          title = title.strip()
          title = title.replace('\n', "")
          title = title.replace('\r', "")
        except:
          pass

        file_type = find_file_type(link)
        if file_type == None or title == False:
          break

        if title == None:
          title = b.string

        # formatting links consistently
        link = urljoin(base_url, link)
        # id
        doc_id = os.path.splitext(urlparse(link).path)[0]

        # these docs are one report where the page has a table of contents with links to content
        if "/index" in link:
          indexed = True
        else:
          indexed = False

        # creating ids
        # there may be a better way to do this but I am just taking out all the things that are not the id
        url_extras = ( "/final", "/fullpdf", "/ins_response", "oig/special/", "USMS/", "plus/", "oig/grants/", "oig/reports/", "EOUSA/", "BOP/", "ATF/", "COPS/", "FBI/", "OJP/", "INS/", "DEA/", "OBD", "/analysis", "/report", "/PDF_list", "/full_report", "/full", "_redacted", "oig", "r-", "/response", "/listpdf", "/memo", "/fullreport", "/Final", "/extradition", "/oig", "/grants", "/index")
        for n in url_extras:
          if n in doc_id:
            doc_id = doc_id.replace(n, "")

        while doc_id[:1] == "/":
          doc_id = doc_id[1:]

        year_match = YEAR_RE.match(doc_id)
        if year_match:
          doc_id = year_match.group(1)

        ag_match = AG_RE.match(link)
        if ag_match:
          doc_id = ag_match.group(1)

        # if it's still got slashes, just turn them into dashes
        # the ol' slash and dash
        doc_id = doc_id.replace("/", "-")

        # some odd cases I hard-coded
        special_cases = {"a0118/au0118":"a0118", "a0207/0207":"a0207"}
        if doc_id in special_cases:
          doc_id = special_cases[doc_id]

        if "spanish" in link:
          language = "Spanish"
        else:
          language = "English"

        report_count += 1

        # if we're filtering on a year, and this isn't in it, skip it
        if int(report_year) not in year_range:
          # print("Skipping report for %s..." % report_year)
          continue

        if doc_id in report:
          if file_type == "pdf":
            # current and previous file pdf
            if report[doc_id]["file_type"] == "pdf":
              report[doc_id]["categories"].append(directory)
            # current file a pdf, old file html
            else:
              report[doc_id]["file_type"] = "pdf"
              report[doc_id]["url"] = link
              report[doc_id]["categories"].append(directory)
          else:
            # current file html old file pdf OR both files html
            report[doc_id]["categories"].append(directory)

          # add url if new
          old_url = False
          for n in report[doc_id]["urls"]:
            if link in n:
              old_url = True
          if not old_url:
            report[doc_id]["urls"].append({
              "url": link,
              "file_type": file_type,
              "indexed": indexed,
            })

          # finding the most descriptive name for cross-listed docs
          if report[doc_id]["agency"] == "doj" and agency != "doj":
            report[doc_id]["agency"] = agency
            report[doc_id]["agency_name"] = agency_name

        # Adding a new document
        else:
          report[doc_id] = {
            "report_id": doc_id,
            "inspector": "doj",
            "inspector_url": "https://oig.justice.gov/reports/",
            "agency": agency,
            "agency_name": agency_name,
            "url": link,
            "title": title,
            "file_type": file_type,
            "categories": [directory,],
            "urls": [{
                "url": link,
                "file_type": file_type,
                "indexed": indexed,
            }],
            "published_on": published_on,
            # perhaps elaborate on this later
            "type": type_for(title),
            "language": language,
          }

  if report_count == 0:
    raise inspector.NoReportsFoundError("DOJ (%s)" % directory)
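
The id clean-up above removes each fragment in url_extras with a plain replace loop. As a minimal sketch (using a shortened, illustrative fragment list rather than the scraper's full tuple), the same pass can be written as a single substitution over an alternation of escaped fragments:

import re

# Sketch only: a truncated stand-in for the scraper's url_extras tuple.
url_extras = ("/final", "/fullpdf", "oig/special/", "USMS/")
# Escape each fragment and join them into one alternation.
cleanup_re = re.compile("|".join(re.escape(extra) for extra in url_extras))

doc_id = "oig/special/USMS/a0118/final"
print(cleanup_re.sub("", doc_id))  # -> a0118

As in the original loop, ordering matters when one fragment is a prefix of another (e.g. "/full" versus "/fullpdf"), so longer fragments should come first.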

Example 64

Project: cgat Source File: run_nubiscan.py
Function: main
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-i", "--iterations", dest="iterations", type="int",
                      help="number of iterations for sampling [default=%default].")

    parser.add_option("-q", "--qvalue", dest="qvalue_threshold", type="float",
                      help="qvalue threshold [default=%default].")

    parser.add_option("--without-combine", dest="combine", action="store_false",
                      help="combine overlapping motifs [default=%default].")

    parser.add_option("-f", "--fdr-control", dest="fdr_control", type="choice",
                      choices=("per-sequence", "all", "xall"),
                      help="method for FDR control [default=%default].")

    parser.add_option("-m", "--motif", dest="motif", type="choice",
                      choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
                      help="motif to scan for [default=%default].")

    parser.add_option("-a", "--arrangements", dest="arrangements", type="string",
                      help="',' separated list of repeat arrangements [default=%default]")

    parser.add_option("-x", "--mask", dest="mask", type="choice",
                      choices=("dust", "repeatmasker"),
                      help="mask sequences before scanning [default=%default].")

    parser.add_option("--output-stats", dest="output_stats", action="store_true",
                      help="output stats [default=%default].")

    parser.add_option("--add-sequence", dest="add_sequence", action="store_true",
                      help="add sequence information [default=%default].")

    parser.set_defaults(
        iterations=100,
        qvalue_threshold=0.05,
        motif="rxrvdr",
        fdr_control="all",
        combine=True,
        arrangements=None,
        mask=None,
        output_stats=False,
        add_sequence=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # do something
    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = [
            "DR%s" % x for x in range(0, 15)] + ["ER%s" % x for x in range(0, 15)]
    else:
        options.arrangements = options.arrangements.split(",")

    options.stdout.write("%s" % "\t".join(Nubiscan.NubiscanMatch._fields))
    if options.add_sequence:
        options.stdout.write("\tsequence")
    options.stdout.write("\n")

    if options.motif == 'nr':
        sense_matrix = NR
    elif options.motif == "rxrvdr":
        sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1":
        sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2":
        sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences(
                [x.sequence for x in seqs], options.mask)
        else:
            masked_seqs = [x.sequence for x in seqs]

        ninput = len(seqs)
        map_id2title = dict(
            enumerate([re.sub("\s.*", "", x.title) for x in seqs]))
        matcher = Nubiscan.MatcherRandomisationSequences(sense_matrix,
                                                         samples=options.iterations)

        results = matcher.run(masked_seqs,
                              options.arrangements,
                              qvalue_threshold=options.qvalue_threshold)

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        for r in results:

            if r.alternatives:
                alternatives = ",".join(
                    [x.arrangement for x in r.alternatives])
            else:
                alternatives = ""

            options.stdout.write("\t".join((
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives)))

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-":
                    s = Genomics.complement(s)
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write("\t%s" % s)

            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile("fdr")
            outfile.write("bin\thist\tnobserved\n")
            for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations):
                outfile.write("%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()

    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence(sense_matrix,
                                                        samples=options.iterations)

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run(seq.sequence,
                             options.arrangements,
                             qvalue_threshold=None)
            for m in mm:
                matches.append(m._replace(sequence=seq.title))

        # estimate qvalues for all matches across all sequences
        pvalues = [x.pvalue for x in matches]
        fdr = Stats.doFDR(pvalues)
        qvalues = fdr.mQValues
        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold:
                continue
            results.append(m._replace(qvalue=qvalue))

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        # output
        for r in results:
            options.stdout.write("\t".join((
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue)) + "\n")

            noutput += 1

    elif options.fdr_control == "per-sequence":
        matcher = Nubiscan.MatcherRandomisationSequence(sense_matrix,
                                                        samples=options.iterations)

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run(seq.sequence,
                                 options.arrangements,
                                 qvalue_threshold=options.qvalue_threshold)

            if options.combine:
                result = Nubiscan.combineMotifs(result)

            t = re.sub(" .*", "",  seq.title)
            for r in result:
                options.stdout.write("\t".join((
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue)) + "\n")

            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
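
The FASTA titles above are trimmed to bare identifiers with re.sub("\s.*", "", x.title), i.e. everything from the first whitespace onward is dropped. A minimal, self-contained illustration (with an invented title):

import re

title = "chr1 assembled from contig AC12345"
# Drop everything from the first whitespace character onward.
identifier = re.sub(r"\s.*", "", title)
print(identifier)  # -> chr1

For single-line titles, title.split()[0] would give the same result; the regex form also tolerates an empty title without raising.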

Example 65

Project: WIPSTER Source File: crits.py
def submit_to_crits(post_data, last_sample, crits_ta, savename=""):

    crits_result = {}
    crits_str_result = ""
    crits_upload_dict = {}
    data = {}
    final_data = {}
    search_res = {}

    for k, v in post_data.iteritems():
        if "chk" in k and v=="on":
            chk_input_key = re.sub("_chk", "", k)
            if chk_input_key in post_data : #Make sure there's an input that matches with the checkbox

                #Create or clear the dict if it already exists
                data, final_data = clear_upload_dicts(data, final_data)
                search_res.clear()

                if "_domain_" in chk_input_key:
                    data['type'] = "domain"
                elif "_ip_" in chk_input_key:
                    data['type'] = "ip"
                elif "_vt_" in chk_input_key:
                    data['type'] = "vt"
                elif "_command_" in chk_input_key:
                    data['type'] = "command"
                elif "_ua_" in chk_input_key:
                    data['type'] = "ua"
                else:
                    data['type'] = "event"

                data['val'] = post_data[chk_input_key]
                if not data['val']: #If the input is empty, check the next form box
                    continue

                if "ta_" in chk_input_key:
                    data['ta'] = True
                else:
                    data['ta'] = False

                # Check whether the object already exists. If it does, pull in the JSON; otherwise, add it to CRITs

                data['search'] = data['val']

                search_res = search_crits(data)

                # Set types for relationships later on
                if data['type'] == "domain":
                    crits_type = "Domain"
                elif data['type'] == "ip":
                    crits_type = "IP"
                else:
                    crits_type = "Event"


                if search_res['objects']: # If result found

                    if data['type'] not in crits_upload_dict: # If a list for that type does not yet exist, create it
                        crits_upload_dict[data['type']] = []
 
                    crits_upload_dict[data['type']].append({"id": search_res['objects'][0]['_id'],
                                                                     "type": crits_type})


                else: # If no result found in search, add it to CRITs

                    final_data = build_data(data, last_sample)
                    crits_upload_res = upload_object(final_data)
                    crits_str_result += "uploaded " + data['type'] + "\r\n\r\n" + str(crits_upload_res) + "\r\n\r\ncuem*********\r\n\r\n"


                    if data['type'] not in crits_upload_dict:  # If a list of that type does not yet exist, create it
                        crits_upload_dict[data['type']] = []

                    crits_upload_dict[data['type']].append({"id": crits_upload_res['id'],
                                                            "type": crits_type})
                    

    #################################################
    #### Handle Uploading the Ticket as an Event ####
    #################################################

    #Create or clear the dict if it already exists
    data, final_data = clear_upload_dicts(data, final_data)
    search_res.clear()

    data['type'] = "ticket"

    # Check whether the ticket already exists. If it does, pull in the JSON; otherwise, add it to CRITs
    data['search'] = last_sample.ticket
    search_res = search_crits(data)

    if 'objects' in search_res: # If an event with this Ticket # is found to exist, use its existing ID
        if search_res['objects']:
            crits_upload_dict['ticket'] = [{'id': search_res['objects'][0]['_id'],
                                            'type': 'Event'}]
#        crits_upload_dict['ticket'][0]['id'] = search_res['objects'][0]['_id']
#        crits_upload_dict['ticket'][0]['type'] = 'Event'
        else: # Otherwise, upload it
            final_data = build_data(data, last_sample)
            crits_upload_res = upload_object(final_data)

            crits_upload_dict['ticket'] = [{'id': crits_upload_res['id'],
                                            'type': 'Event'}]
            crits_str_result += "\r\nUploaded Ticket: " + str(crits_upload_dict['ticket'][0]) + "\r\n\r\n***********************\r\n"


    ############################################
    #### Handle uploading the sample itself ####
    ############################################

    #Create or clear the dict if it already exists
    data, final_data = clear_upload_dicts(data, final_data)
    search_res.clear()

    data['type'] = "sample"

    # Check whether the sample already exists. If it does, pull in the JSON; otherwise, add it to CRITs
    data['search'] = last_sample.md5
    search_res = search_crits(data)

    if 'objects' in search_res:
        if search_res['objects']:
            crits_upload_dict['sample'] = [{'id': search_res['objects'][0]['_id'],
                                            'type': 'Sample'}]

        else:
            # Need to handle renaming the sample to remove the .MAL when adding to CRITs
            # Before calling build_data()

            savename = "sanalysis/static/"+savename
            newname = threatanalyzer.remove_mal(savename) # Copy the file without .MAL - Removed later in main method
            
            final_data = build_data(data, last_sample, newname=newname)
            crits_upload_res = upload_object(final_data)

            rem_tmp = threatanalyzer.remove_tmp_file(newname) # Remove the copy of the file that doesn't have .MAL


            crits_upload_dict['sample'] = [{'id': crits_upload_res['id'],
                                            'type': 'Sample'}]

            crits_str_result += "\r\nUploaded Sample: \r\n" + str(crits_upload_dict['sample'][0]) + "\r\n\r\n****************\r\n\r\n"

    ########################################################
    #### Handle uploading metadata of any dropped files ####
    ########################################################
    
    if 'crits_dropped' in crits_ta:
        crits_upload_dict['sample_metadata'] = []
        for dropped in crits_ta['crits_dropped']:
        
            data, final_data = clear_upload_dicts(data, final_data)
            search_res.clear()
            
            data['type'] = "sample_metadata"
            data['val'] = dropped
            
            data['search'] = dropped['md5']
            search_res = search_crits(data)
            
            if 'objects' in search_res:
                if search_res['objects']:
                    crits_upload_dict['sample_metadata'].append({'id': search_res['objects'][0]['_id'],
                                                             'type': 'Sample'})
                                                         
                else:
                    final_data = build_data(data, last_sample)
                    crits_upload_res = upload_object(final_data)
                    
                    crits_upload_dict['sample_metadata'].append({'id': crits_upload_res['id'],
                                                             'type': 'Sample'})
                    crits_str_result += "\r\nUploaded Sample MetaData: \r\n" + str(crits_upload_dict['sample_metadata'][-1]) + "\r\n\r\n****************\r\n\r\n"



    ##################################
    #### Handle all relationships ####
    ##################################

    data, final_data = clear_upload_dicts(data, final_data) #Clear dicts

    crits_str_result += "\r\n\r\n****************crits_upload_dict*****************\r\n\r\n" + str(crits_upload_dict)

    relation_res = relate_objects(crits_upload_dict, last_sample)

    crits_str_result += "\r\n\r\n****************relation_res*****************\r\n\r\n" + str(relation_res)

    return crits_str_result
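
In the loop above, re.sub("_chk", "", k) maps each checked checkbox field back to the form input that holds its value. A minimal sketch with made-up field names (not WIPSTER's actual form keys):

import re

post_data = {"ta_domain_1_chk": "on", "ta_domain_1": "evil.example.com"}

for k, v in post_data.items():
    if "chk" in k and v == "on":
        # Strip the "_chk" suffix to find the matching input field.
        chk_input_key = re.sub("_chk", "", k)
        print(chk_input_key, "->", post_data.get(chk_input_key))
# prints: ta_domain_1 -> evil.example.com

Since "_chk" is a fixed literal, k.replace("_chk", "") would behave identically in this sketch.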

Example 66

Project: poio-api Source File: toolbox.py
    def _build_annotations(self):
        """
        Helper method to parse the input file and store intermediate information
        in attributes.

        """
 
        elements = dict()
        ids = dict()

        current_record_id = 0
        current_utterance_id = 0
        current_id = 0

        first_marker_found = False
        tier_marker = None

        current_utterance = None

        # Go through lines in the input file
        for line_number, line in enumerate(self.input_stream):
            # remove BOM
            if line_number == 0:

                if line.startswith(codecs.BOM_UTF8):
                    line = line[BOMLEN:]
                    
            line = line.decode("utf-8", 'ignore')
            line = line.strip()

            if "\\name" in line:
                self.meta_information = line.split(None,2)[2]

            if line == "":
                if len(elements) > 0:
                    self._process_record(elements, ids, current_utterance_id)
                    elements = dict()
                    ids = dict()
                continue

            # parse line
            last_tier_marker = tier_marker
            tier_marker = None
            line_content = None
            match_tier_marker = re_tier_marker.search(line)
            if match_tier_marker:
                tier_marker = match_tier_marker.group(1)
                line_content = re_tier_marker.sub("", line)
                line_content = line_content.lstrip()
            elif first_marker_found:
                # append to last annotation's content
                id_to_add = current_record_id
                if last_tier_marker in self.utterance_level_markers:
                    id_to_add = current_utterance_id

                if self._tier_labels.tier_label_exists(last_tier_marker):
                    self._annotations_for_parent[
                        ("a{0}".format(id_to_add),
                            last_tier_marker)][-1].value += " " + \
                            line

                tier_marker = last_tier_marker
                continue

            # skip all lines before first record marker
            if not first_marker_found:
                if tier_marker != self.record_marker:
                    continue
                else:
                    first_marker_found = True

            if tier_marker in self.word_level_markers:
                # Is it a new utterance? Then create a new ID.
                if current_utterance is None:
                    current_utterance = ""

                if current_utterance == "":
                    current_utterance_id = current_id
                    current_id += 1

                current_utterance += re.sub("\s+", " ", line_content) + " "

            if tier_marker in self.word_level_markers or \
                    tier_marker in self.morpheme_level_markers or \
                    tier_marker in self.tag_level_markers:

                if tier_marker not in elements:
                    elements[tier_marker] = dict()
                if tier_marker not in ids:
                    ids[tier_marker] = dict()

                for j, match in enumerate(re_word.finditer(line)):
                    pos = char_len(line[:match.start(1)])
                    elements[tier_marker][pos] = match.group(1)
                    ids[tier_marker][pos] = "a{0}".format(current_id)
                    current_id += 1

            # utterance level markers
            elif tier_marker in self.utterance_level_markers:

                # we left the utterance tiers, so create an utterance
                # annotation based on the content and make it the current
                # utterance
                if current_utterance is not None and current_utterance != "":
                    current_utterance = current_utterance.rstrip()
                    self._annotate_utterance(current_record_id,
                                             current_utterance_id,
                                             current_utterance)

                    current_utterance = ""

                elif current_utterance is None:
                    current_utterance_id = current_id
                    current_id += 1
                    self._annotate_utterance(current_record_id,
                                             current_utterance_id, "")

                    current_utterance = ""

                # add the annotation to the current utterance
                self._annotations_for_parent[
                    ("a{0}".format(current_utterance_id), tier_marker)].append(
                        poioapi.io.graf.Annotation(
                            "a{0}".format(current_id), line_content))
                current_id += 1

            # record level markers
            elif tier_marker in self.record_level_markers:

                if tier_marker == self.record_marker:
                    # this ensures that the utterance gets annotated even
                    # if there were no other utterance_level_markers to
                    # trigger it
                    if current_utterance is not None and current_utterance != '':
                        self._annotate_utterance(current_record_id,
                                                 current_utterance_id,
                                                 current_utterance)

                    self._annotations_for_parent[
                        (None, tier_marker)].append(
                            poioapi.io.graf.Annotation(
                                "a{0}".format(current_id), line_content))
                    current_record_id = current_id
                    current_id += 1
                    current_utterance = None

                else:
                    # this ensures that the utterance gets annotated even
                    # if there were no other utterance_level_markers to
                    # trigger it
                    if current_utterance is not None and current_utterance != '':
                        self._annotate_utterance(current_record_id,
                                                 current_utterance_id,
                                                 current_utterance)

                    self._annotations_for_parent[
                        ("a{0}".format(current_record_id), tier_marker)].append(
                            poioapi.io.graf.Annotation(
                                "a{0}".format(current_id), line_content))

                    current_id += 1

        self.input_stream.seek(0)
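
While collecting word-level content above, re.sub("\s+", " ", line_content) collapses runs of whitespace before the text is appended to the current utterance. A minimal standalone sketch of that normalisation (example string invented):

import re

line_content = "ein\tkleines   Beispiel"
# Collapse any run of whitespace (tabs, multiple spaces) into a single space.
normalised = re.sub(r"\s+", " ", line_content) + " "
print(repr(normalised))  # -> 'ein kleines Beispiel '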

Example 67

Project: LTLMoP Source File: specCompiler.py
    def _writeLTLFile(self):

        self.LTL2SpecLineNumber = None

        #regionList = [r.name for r in self.parser.proj.rfi.regions]
        regionList = [r.name for r in self.proj.rfi.regions]
        sensorList = deepcopy(self.proj.enabled_sensors)
        robotPropList = self.proj.enabled_actuators + self.proj.all_customs

        text = self.proj.specText

        response = None

        # Create LTL using selected parser
        # TODO: rename decomposition object to something other than 'parser'
        if self.proj.compile_options["parser"] == "slurp":
            # default to no region tags if no simconfig is defined, so we can compile without one
            if self.proj.current_config == "":
                region_tags = {}
            else:
                self.hsub = handlerSubsystem.HandlerSubsystem(None, self.proj.project_root)
                config, success = self.hsub.loadConfigFile(self.proj.current_config)
                if success: self.hsub.configs.append(config)
                self.hsub.setExecutingConfig(self.proj.current_config)

                region_tags = self.hsub.executing_config.region_tags

            # Hack: We need to make sure there's only one of these
            global _SLURP_SPEC_GENERATOR

            # Make a new specgenerator and have it process the text
            if not _SLURP_SPEC_GENERATOR:
                # Add SLURP to path for import
                p = os.path.dirname(os.path.abspath(__file__))
                sys.path.append(os.path.join(p, "..", "etc", "SLURP"))
                from ltlbroom.specgeneration import SpecGenerator
                _SLURP_SPEC_GENERATOR = SpecGenerator()

            # Filter out regions it shouldn't know about
            filtered_regions = [region.name for region in self.proj.rfi.regions
                                if not (region.isObstacle or region.name.lower() == "boundary")]
            LTLspec_env, LTLspec_sys, self.proj.internal_props, internal_sensors, results, responses, traceback = \
                _SLURP_SPEC_GENERATOR.generate(text, sensorList, filtered_regions, robotPropList, region_tags)

            oldspec_env = LTLspec_env
            oldspec_sys = LTLspec_sys

            for ln, result in enumerate(results):
                if not result:
                    logging.warning("Could not parse the sentence in line {0}".format(ln))

            # Abort compilation if there were any errors
            if not all(results):
                return None, None, responses

            # Add in the sensors so they go into the SMV and spec files
            for s in internal_sensors:
                if s not in sensorList:
                    sensorList.append(s)
                    self.proj.all_sensors.append(s)
                    self.proj.enabled_sensors.append(s)

            # Conjoin all the spec chunks
            LTLspec_env = '\t\t' + ' & \n\t\t'.join(LTLspec_env)
            LTLspec_sys = '\t\t' + ' & \n\t\t'.join(LTLspec_sys)

            if self.proj.compile_options["decompose"]:
                # substitute decomposed region names
                for r in self.proj.rfi.regions:
                    if not (r.isObstacle or r.name.lower() == "boundary"):
                        LTLspec_env = re.sub('\\bs\.' + r.name + '\\b', "("+' | '.join(["s."+x for x in self.parser.proj.regionMapping[r.name]])+")", LTLspec_env)
                        LTLspec_env = re.sub('\\be\.' + r.name + '\\b', "("+' | '.join(["e."+x for x in self.parser.proj.regionMapping[r.name]])+")", LTLspec_env)
                        LTLspec_sys = re.sub('\\bs\.' + r.name + '\\b', "("+' | '.join(["s."+x for x in self.parser.proj.regionMapping[r.name]])+")", LTLspec_sys)
                        LTLspec_sys = re.sub('\\be\.' + r.name + '\\b', "("+' | '.join(["e."+x for x in self.parser.proj.regionMapping[r.name]])+")", LTLspec_sys)

            response = responses

        elif self.proj.compile_options["parser"] == "ltl":
            # delete comments
            text = re.sub(r"#.*$", "", text, flags=re.MULTILINE)

            # split into env and sys parts (by looking for a line of just dashes in between)
            LTLspec_env, LTLspec_sys = re.split(r"^\s*-+\s*$", text, maxsplit=1, flags=re.MULTILINE)

            # split into subformulas
            LTLspec_env = re.split(r"(?:[ \t]*[\n\r][ \t]*)+", LTLspec_env)
            LTLspec_sys = re.split(r"(?:[ \t]*[\n\r][ \t]*)+", LTLspec_sys)

            # remove any empty initial entries (HACK?)
            while '' in LTLspec_env:
                LTLspec_env.remove('')
            while '' in LTLspec_sys:
                LTLspec_sys.remove('')

            # automatically conjoin all the subformulas
            LTLspec_env = '\t\t' + ' & \n\t\t'.join(LTLspec_env)
            LTLspec_sys = '\t\t' + ' & \n\t\t'.join(LTLspec_sys)

            if self.proj.compile_options["decompose"]:
                # substitute decomposed region
                for r in self.proj.rfi.regions:
                    if not (r.isObstacle or r.name.lower() == "boundary"):
                        LTLspec_env = re.sub('\\b(?:s\.)?' + r.name + '\\b', "("+' | '.join(["s."+x for x in self.parser.proj.regionMapping[r.name]])+")", LTLspec_env)
                        LTLspec_sys = re.sub('\\b(?:s\.)?' + r.name + '\\b', "("+' | '.join(["s."+x for x in self.parser.proj.regionMapping[r.name]])+")", LTLspec_sys)
            else:
                for r in self.proj.rfi.regions:
                    if not (r.isObstacle or r.name.lower() == "boundary"):
                        LTLspec_env = re.sub('\\b(?:s\.)?' + r.name + '\\b', "s."+r.name, LTLspec_env)
                        LTLspec_sys = re.sub('\\b(?:s\.)?' + r.name + '\\b', "s."+r.name, LTLspec_sys)

            traceback = [] # HACK: needs to be something other than None
        elif self.proj.compile_options["parser"] == "structured":
            import parseEnglishToLTL

            if self.proj.compile_options["decompose"]:
                # substitute the regions name in specs
                for m in re.finditer(r'near (?P<rA>\w+)', text):
                    text=re.sub(r'near (?P<rA>\w+)', "("+' or '.join(["s."+r for r in self.parser.proj.regionMapping['near$'+m.group('rA')+'$'+str(50)]])+")", text)
                for m in re.finditer(r'within (?P<dist>\d+) (from|of) (?P<rA>\w+)', text):
                    text=re.sub(r'within ' + m.group('dist')+' (from|of) '+ m.group('rA'), "("+' or '.join(["s."+r for r in self.parser.proj.regionMapping['near$'+m.group('rA')+'$'+m.group('dist')]])+")", text)
                for m in re.finditer(r'between (?P<rA>\w+) and (?P<rB>\w+)', text):
                    text=re.sub(r'between ' + m.group('rA')+' and '+ m.group('rB'),"("+' or '.join(["s."+r for r in self.parser.proj.regionMapping['between$'+m.group('rA')+'$and$'+m.group('rB')+"$"]])+")", text)

                # substitute decomposed region
                for r in self.proj.rfi.regions:
                    if not (r.isObstacle or r.name.lower() == "boundary"):
                        text = re.sub('\\b' + r.name + '\\b', "("+' | '.join(["s."+x for x in self.parser.proj.regionMapping[r.name]])+")", text)

                regionList = ["s."+x.name for x in self.parser.proj.rfi.regions]
            else:
                for r in self.proj.rfi.regions:
                    if not (r.isObstacle or r.name.lower() == "boundary"):
                        text = re.sub('\\b' + r.name + '\\b', "s."+r.name, text)

                regionList = ["s."+x.name for x in self.proj.rfi.regions]

            spec, traceback, failed, self.LTL2SpecLineNumber, self.proj.internal_props = parseEnglishToLTL.writeSpec(text, sensorList, regionList, robotPropList)

            # Abort compilation if there were any errors
            if failed:
                return None, None, None

            LTLspec_env = spec["EnvInit"] + spec["EnvTrans"] + spec["EnvGoals"]
            LTLspec_sys = spec["SysInit"] + spec["SysTrans"] + spec["SysGoals"]

        else:
            logging.error("Parser type '{0}' not currently supported".format(self.proj.compile_options["parser"]))
            return None, None, None

        if self.proj.compile_options["decompose"]:
            regionList = [x.name for x in self.parser.proj.rfi.regions]
        else:
            regionList = [x.name for x in self.proj.rfi.regions]

        if self.proj.compile_options["use_region_bit_encoding"]:
            # Define the number of bits needed to encode the regions
            numBits = int(math.ceil(math.log(len(regionList),2)))

            # creating the region bit encoding
            bitEncode = bitEncoding(len(regionList),numBits)
            currBitEnc = bitEncode['current']
            nextBitEnc = bitEncode['next']

            # switch to bit encodings for regions
            LTLspec_env = replaceRegionName(LTLspec_env, bitEncode, regionList)
            LTLspec_sys = replaceRegionName(LTLspec_sys, bitEncode, regionList)

            if self.LTL2SpecLineNumber is not None:
                for k in self.LTL2SpecLineNumber.keys():
                    new_k = replaceRegionName(k, bitEncode, regionList)
                    if new_k != k:
                        self.LTL2SpecLineNumber[new_k] = self.LTL2SpecLineNumber[k]
                        del self.LTL2SpecLineNumber[k]

        if self.proj.compile_options["decompose"]:
            adjData = self.parser.proj.rfi.transitions
        else:
            adjData = self.proj.rfi.transitions

        # Store some data needed for later analysis
        self.spec = {}
        if self.proj.compile_options["decompose"]:
            self.spec['Topo'] = createTopologyFragment(adjData, self.parser.proj.rfi.regions, use_bits=self.proj.compile_options["use_region_bit_encoding"])
        else:
            self.spec['Topo'] = createTopologyFragment(adjData, self.proj.rfi.regions, use_bits=self.proj.compile_options["use_region_bit_encoding"])

        # Substitute any macros that the parsers passed us
        LTLspec_env = self.substituteMacros(LTLspec_env)
        LTLspec_sys = self.substituteMacros(LTLspec_sys)

        # If we are not using bit-encoding, we need to
        # explicitly encode a mutex for regions
        if not self.proj.compile_options["use_region_bit_encoding"]:
            # DNF version (extremely slow for core-finding)
            #mutex = "\n\t&\n\t []({})".format(" | ".join(["({})".format(" & ".join(["s."+r2.name if r is r2 else "!s."+r2.name for r2 in self.parser.proj.rfi.regions])) for r in self.parser.proj.rfi.regions]))

            if self.proj.compile_options["decompose"]:
                region_list = self.parser.proj.rfi.regions
            else:
                region_list = self.proj.rfi.regions

            # Almost-CNF version
            exclusions = []
            for i, r1 in enumerate(region_list):
                for r2 in region_list[i+1:]:
                    exclusions.append("!(s.{} & s.{})".format(r1.name, r2.name))
            mutex = "\n&\n\t []({})".format(" & ".join(exclusions))
            LTLspec_sys += mutex

        self.spec.update(self.splitSpecIntoComponents(LTLspec_env, LTLspec_sys))

        # Add in a fragment to make sure that we start in a valid region
        if self.proj.compile_options["decompose"]:
            self.spec['InitRegionSanityCheck'] = createInitialRegionFragment(self.parser.proj.rfi.regions, use_bits=self.proj.compile_options["use_region_bit_encoding"])
        else:
            self.spec['InitRegionSanityCheck'] = createInitialRegionFragment(self.proj.rfi.regions, use_bits=self.proj.compile_options["use_region_bit_encoding"])
        LTLspec_sys += "\n&\n" + self.spec['InitRegionSanityCheck']

        LTLspec_sys += "\n&\n" + self.spec['Topo']

        createLTLfile(self.proj.getFilenamePrefix(), LTLspec_env, LTLspec_sys)

        if self.proj.compile_options["parser"] == "slurp":
            self.reversemapping = {self.postprocessLTL(line,sensorList,robotPropList).strip():line.strip() for line in oldspec_env + oldspec_sys}
            self.reversemapping[self.spec['Topo'].replace("\n","").replace("\t","").lstrip().rstrip("\n\t &")] = "TOPOLOGY"

        #for k,v in self.reversemapping.iteritems():
        #    print "{!r}:{!r}".format(k,v)

        return self.spec, traceback, response
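
The decomposition step above rewrites every region name into a disjunction of its sub-regions using word-boundary patterns such as '\\bs\.' + r.name + '\\b'. A minimal sketch of that idiom with invented region names (the real code builds the mapping from self.parser.proj.regionMapping):

import re

region = "kitchen"
sub_regions = ["kitchen_p1", "kitchen_p2"]
spec = "[](s.kitchen -> next(s.kitchen2 | s.kitchen))"

# \b on both sides keeps "s.kitchen" from also matching inside "s.kitchen2".
replacement = "(" + " | ".join("s." + x for x in sub_regions) + ")"
spec = re.sub(r"\bs\." + re.escape(region) + r"\b", replacement, spec)
print(spec)
# -> []((s.kitchen_p1 | s.kitchen_p2) -> next(s.kitchen2 | (s.kitchen_p1 | s.kitchen_p2)))

Note that the sketch adds re.escape around the region name; the original relies on region names containing only word characters.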

Example 68

Project: PipelineConstructionSet Source File: expressions.py
def expressionAsCSharpFunction(expressionNode, conflictDict, refTypeDict, jointOrientVars):
	"""Return the expression node's contents as C# for Unity
	@param expressionNode the expression node to translate into C#
	@param conflictDict a dictionary mapping Maya reference names to conflict-resolved alternatives
	@param refTypeDict a dictionary storing the Unity type for each reference in the list
	@param jointOrientVars a dictionary mapping joint names to their prerotation Quaternion variables"""
	# get the expression's contents
	expression = cmds.expression(expressionNode, q=True, s=True)
		
	# remove comments
	expression = re.sub('//.*?[\n\r]', '', expression)
	for comment in re.findall('/\*.*?\*/', expression, re.DOTALL):
		expression = expression.replace(comment, '')
	
	# find types of reference fields and variables
	referenceFieldDict = getReferenceFieldDict(expressionNode)
	variableTypeDict = getVariableTypeDict(expression, referenceFieldDict)
	
	# append variable name conflicts to conflictDict
	for v in variableTypeDict:
		if not v[1:] in kReservedKeywords and not v[1:] in conflictDict: conflictDict[v] = v
		else: conflictDict[v] = '$__varNameConflict__%s'%v[1:]
	
	# fix naming conflicts
	for conflict in conflictDict:
		if re.match('%s(?=\W)'%re.escape(conflict), expression):
			expression = re.sub('\A%s(?=\W)'%re.escape(conflict), conflictDict[conflict], expression, 1)
		expression = re.sub('(?<=\W)%s(?=\W)'%re.escape(conflict), conflictDict[conflict], expression)
	
	# attempt to make on-demand math commands Unity-friendly
	expression = convertMathCommands(expression)
	
	# make print statements Unity-friendly
	skip = 0
	while len(re.findall('print\s+.*?;', expression)) > skip:
		printStatement = list(re.finditer('print\s+.*?;', expression))[skip]
		if printStatement.start(0) == 0 or not re.match('[\w\d.$]', expression[printStatement.start(0)-1]):
			expression = (expression[0:printStatement.start(0)] + 
				'Debug.Log(%s);'%re.match('.*?(?=;)', printStatement.group(0)[len('print'):].lstrip()).group(0) + 
				expression[printStatement.end(0):])
		else: skip += 1
	
	# reformat lines so there is one line of code per line of text
	lines = expression.split(';')
	expression = ''
	for line in lines:
		if line == '': continue
		if line[-1] == '}': line = '\n%s'%re.sub('\n', ' ', line.lstrip().rstrip())
		else: line = '\n%s;'%re.sub('\n', ' ', line.lstrip().rstrip())
		expression += re.sub('  ', ' ', line)
	expression = re.sub(';\n}', '; }\n', expression.lstrip())
		
	# remove lines that have only a declaration
	lines = expression.split(';')
	expression = ''
	for line in lines:
		if line == '': continue
		if re.search('\w+\s+\$\w+', line):
			if not '=' in line: continue
			else: line = re.sub('\w+\s+(?=\$\w)', '', line)
		if line[-1] == '}' or line[-1] == '\n': expression += line
		else: expression += '%s;'%line
	
	# split multi-assignments into separate lines
	lines = expression.split(';')
	for line in lines:
		newLines = ['%s;'%line]
		assignments = re.split('(?<![=<>])=(?!=)', line)
		finalValue = assignments[len(assignments)-1]
		if len(assignments) > 2:
			newLines = []
			for i in range(len(assignments)-1):
				if i == 0: newLines.append('%s = %s;'%(assignments[i].rstrip(), finalValue.lstrip()))
				else: newLines.append('%s = %s;'%(assignments[i].rstrip(), re.match('[\s\w.]+', assignments[0][::-1]).group(0)[::-1].rstrip().lstrip()))
		replacementLine = newLines[0]
		for i in range(len(newLines)):
			if i == 0: continue
			replacementLine = '%s\n%s'%(replacementLine, newLines[i])
		expression = expression.replace('%s;'%line, replacementLine)
	
	# reformat operators
	for op in ['=', '==', '<', '<=', '>', '>=', '!=', '+', '+=', '-=', '*', '*=', '/', '/=', '?', ':']:
		expression = re.sub('(?<=[\w\d\s()])%s(?=[\w\d\s()$])'%re.escape(op), ' %s '%op, expression)
		expression = re.sub('(?<=[\w\d()])\s+%s\s+(?=[\w\d()$])'%re.escape(op), ' %s '%op, expression)
	for op in ['-']:
		expression = re.sub('\s+%s\s+'%re.escape(op), op, expression)
		expression = re.sub('(?<=[\w\d()])%s(?=[\w\d()$])'%re.escape(op), ' %s '%op, expression)
		expression = re.sub('(?<=[\w\d()])%s%s(?=[\w\d()$])'%(re.escape(op),re.escape(op)), '%s %s'%(op,op), expression)
		expression = re.sub('(?<=[\w\d)])%s\s'%(re.escape(op)), ' %s '%op, expression)
	for op in ['++', '--']: # TODO: this hasn't been thoroughly examined
		expression = re.sub('\s+%s'%re.escape(op), '%s'%op, expression)
	
	# reformat one-line if-statements
	lines = expression.split('\n')
	expression = ''
	for line in lines:
		line = line.lstrip().rstrip()
		if (re.match('if\W', line) or re.match('else if\W', line)) and not re.search('{', line):
			openCt = 0
			closeCt = 0
			endBlock = 0
			for i in xrange(len(line)):
				if endBlock > 0: continue
				if line[i] == '(': openCt += 1
				elif line[i] == ')':
					closeCt += 1
					if openCt > 0 and openCt == closeCt: endBlock = i
			if endBlock > 0: line = '%s\n{%s;}'%(line[:endBlock+1], line[endBlock+1:-1])
		elif re.match('else\W', line) and not re.search('{', line):
			line = 'else {%s}'%line[5:]
		expression += '%s\n'%line
	
	# reformat tabbing
	expression = re.sub('{', '\n{\n', expression)
	expression = re.sub('}', '\n}\n', expression)
	lines = expression.split('\n')
	expression = ''
	indent = 0
	for line in lines:
		line = line.lstrip().rstrip()
		line = re.sub('\s+', ' ', line)
		if line == '': continue
		bracketOpen = len(re.findall('{', line))
		bracketClose = len(re.findall('}', line))
		if bracketOpen<bracketClose: indent -= 1
		expression += '%s%s\n'%('\t'*indent, line)
		if bracketOpen>bracketClose: indent += 1
	expression = expression.rstrip()
	
	# consolidate literal blocks
	expression = consolidateLiteralBlocks(expression, conflictDict, referenceFieldDict, variableTypeDict)
	
	# correct assignments
	expression = correctAssignments(expression, conflictDict, refTypeDict, referenceFieldDict, variableTypeDict, jointOrientVars)
	
	# replace attributeNames with names of Unity fields as needed
	expression = correctReferenceFieldSyntax(expression, conflictDict, refTypeDict)
	
	# clean up spacing
	for op in ['= =', '< =', '> =', '! =', '+ =', '- =', '* =', '/ =']:
		expression = re.sub(re.escape(op), op.replace(' ', ''), expression)
	matches = list(re.finditer('(?<=\()\-\s(?=\d)', expression))
	for match in matches: expression = re.sub(re.escape(match.group(0)), '-', expression)
	
	# convert addition and subtraction of negatives
	expression = re.sub('\+\s\-(?=\S)', '- ', expression)
	expression = re.sub('\+\s\(\-\(', '- ((', expression)
	expression = re.sub('\-\s\-(?=\S)', '+ ', expression)
	expression = re.sub('\-\s\(\-\(', '+ ((', expression)
	
	# correct float literal syntax
	expression = correctFloatLiteralSyntax(expression)
	
	# comment out lines with broken references e.g., .I[1] or .O[3]
	lines = expression.split('\n')
	expression = ''
	for line in lines:
		if line == '': continue
		if re.search('\s\.\w', line) or re.match('\.\w', line): line = '//%s // ERROR: Missing object reference. Check your Maya file.'%line
		expression += '%s\n'%line
	expression = expression.rstrip()
	
	# put all variable declarations up front
	declarations = ''
	for var in variableTypeDict:
		declarations += '\n%s %s;'%(variableTypeDict[var], conflictDict[var])
	expression = (declarations.lstrip()+'\n\n'+expression.lstrip()).lstrip()
	
	# remove variable symbols
	expression = expression.replace('$', '')
	
	#print expression
	return expression
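
Several passes above wrap each operator in re.escape() and use lookbehind/lookahead groups so spaces are inserted around it without consuming the neighbouring characters. A small illustration with a toy expression (not taken from the project):

import re

expression = "$x=2*$y+1"
for op in ['=', '+', '*']:
    # re.escape protects metacharacters like '*' and '+'; the lookarounds are
    # zero-width, so only the operator itself is replaced.
    expression = re.sub(r'(?<=[\w\d\s()])%s(?=[\w\d\s()$])' % re.escape(op),
                        ' %s ' % op, expression)
print(expression)  # -> $x = 2 * $y + 1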

Example 69

Project: nrvr-commander Source File: ssh.py
    def __init__(self,
                 fromPath, toPath,
                 fromSshParameters=None, toSshParameters=None,
                 recurseDirectories=False,
                 preserveTimes=True):
        """Create new ScpCommand instance.
        
        Will wait until completed.
        
        Captures returncode, and output.
        
        Either fromPath or toPath is expected to be local, i.e. without user and without IP address.
        Correspondingly either fromSshParameters or toSshParameters must NOT be assigned an SshParameters
        instance and remain default None.
        
        fromPath
            one path or a list of paths.
            
            Absolute paths strongly recommended.
        
        toPath
            one path.
            
            Absolute path strongly recommended.
            
            Must be directory if more than one fromPath.
        
        fromSshParameters
            an SshParameters instance.
        
        toSshParameters
            an SshParameters instance.
        
        recurseDirectories
            a hint for when fromSshParameters is set."""
        if not _gotPty:
            # cannot use scp if no pty
            raise Exception("must have module pty available to use scp command"
                            ", which is known to be available in Python 2.6 on Linux, but not on Windows")
        #
        if fromSshParameters and toSshParameters:
            raise Exception("cannot copy if both fromSshParameters and toSshParameters, only one or other")
        if not fromSshParameters and not toSshParameters:
            raise Exception("cannot copy if neither fromSshParameters nor toSshParameters, requires one or other")
        #
        if not isinstance(fromPath, (list, tuple)): # should be one string for one path to copy from
            fromPaths = [fromPath]
        else: # should be a list of strings for multiple paths to copy from
            fromPaths = fromPath
        if len(fromPaths) == 0:
            raise Exception("cannot copy zero files, requires at least one")
        if fromSshParameters: # get files from remote
            if len(fromPaths) > 1 or recurseDirectories:
                if not os.path.isdir(toPath):
                    raise Exception("cannot copy multiple files into a file, must copy into a directory, not into %s" % toPath)
            self._fromSpecification = \
                [fromSshParameters.user + "@" + IPAddress.asString(fromSshParameters.ipaddress) + ":" + " ".join(fromPaths)]
            self._toSpecification = toPath
            self._ipaddress = fromSshParameters.ipaddress
            self._pwd = fromSshParameters.pwd
        else: # put files to remote
            anyFromDirectory = False
            for path in fromPaths:
                if os.path.isdir(path):
                    anyFromDirectory = True
                    break
            if anyFromDirectory:
                recurseDirectories = True # mandatory in this case
            self._fromSpecification = fromPaths
            self._toSpecification = \
                toSshParameters.user + "@" + IPAddress.asString(toSshParameters.ipaddress) + ":" + toPath
            self._ipaddress = toSshParameters.ipaddress
            self._pwd = toSshParameters.pwd
        self._args = ["scp"]
        if preserveTimes:
            self._args.append("-p")
        if recurseDirectories:
            self._args.append("-r")
        self._args.extend(self._fromSpecification) # a list because possibly more than one
        self._args.append(self._toSpecification)
        #
        self._output = ""
        self._returncode = None
        #
        # fork and connect child to a pseudo-terminal
        self._pid, self._fd = pty.fork()
        if self._pid == 0:
            # in child process
            os.execvp("scp", self._args)
        else:
            # in parent process
            if self._pwd:
                # if given a password then apply
                promptedForPassword = False
                outputTillPrompt = ""
                # look for password prompt
                while not promptedForPassword:
                    try:
                        newOutput = os.read(self._fd, 1024)
                        if not len(newOutput):
                            # end has been reached
                            # was raise Exception("unexpected end of output from scp")
                            raise Exception("failing to connect for scp\n" + 
                                            outputTillPrompt)
                        # ssh has been observed returning "\r\n" for newline, but we want "\n"
                        newOutput = SshCommand._crLfRegex.sub("\n", newOutput)
                        outputTillPrompt += newOutput
                        if SshCommand._acceptPromptRegex.search(outputTillPrompt):
                            # e.g. "Are you sure you want to continue connecting (yes/no)? "
                            raise Exception("cannot proceed unless having accepted host key\n" +
                                            outputTillPrompt +
                                            '\nE.g. invoke SshCommand.acceptKnownHostKey(SshParameters("{0}",user,pwd)).'.format(self._ipaddress))
                        if SshCommand._pwdPromptRegex.search(outputTillPrompt):
                            # e.g. "10.123.45.67's password: "
                            promptedForPassword = True
                    except EnvironmentError:
                        # e.g. "@    WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!     @" and closing
                        raise Exception("failing to connect for scp\n" + 
                                        outputTillPrompt)
                os.write(self._fd, self._pwd + "\n")
            # look for output
            endOfOutput = False
            outputSincePrompt = ""
            try:
                while not endOfOutput:
                    try:
                        newOutput = os.read(self._fd, 1024)
                        if len(newOutput):
                            outputSincePrompt += newOutput
                        else:
                            # end has been reached
                            endOfOutput = True
                    except EnvironmentError as e:
                        # some ideas maybe at http://bugs.python.org/issue5380
                        if e.errno == 5: # errno.EIO:
                            # seen when pty closes OSError: [Errno 5] Input/output error
                            endOfOutput = True
                        else:
                            # we accept what we got so far, for now
                            endOfOutput = True
            finally:
                # remove any leading whitespace (possibly left after the "password:" prompt) and
                # the first newline (left after entering the password and "\n")
                self._output = re.sub(r"^\s*?\n(.*)$", r"\1", outputSincePrompt)
                #
                # get returncode
                try:
                    ignorePidAgain, waitEncodedStatusIndication = os.waitpid(self._pid, 0)
                    if os.WIFEXITED(waitEncodedStatusIndication):
                        # normal exit(status) call
                        self._returncode = os.WEXITSTATUS(waitEncodedStatusIndication)
                        # raise an exception if there is a reason
                        exceptionMessage = ""
                        if self._returncode:
                            exceptionMessage += "returncode: " + str(self._returncode)
                        if exceptionMessage:
                            commandDescription = "scp from:\n\t" + str(self._fromSpecification)
                            commandDescription += "\nto:\n\t" + self._toSpecification
                            commandDescription += "\nargs:\n\t" + str(self._args)
                            exceptionMessage = commandDescription + "\n" + exceptionMessage
                            exceptionMessage += "\noutput:\n" + self._output
                            raise ScpCommandException(exceptionMessage)
                    else:
                        # e.g. os.WIFSIGNALED or os.WIFSTOPPED
                        self._returncode = -1
                        raise ScpCommandException("scp did not exit normally")
                except OSError:
                    # supposedly can occur
                    self._returncode = -1
                    raise ScpCommandException("scp did not exit normally")
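
Two substitutions tidy the captured scp output above: a precompiled class attribute (SshCommand._crLfRegex) rewrites "\r\n" to "\n", and re.sub(r"^\s*?\n(.*)$", r"\1", ...) drops the leading whitespace and newline left behind after the password prompt. A standalone sketch of both steps, assuming the compiled pattern is simply re.compile(r"\r\n"):

import re

_crLfRegex = re.compile(r"\r\n")  # assumed equivalent of SshCommand._crLfRegex

raw = "  \r\nfile.txt 100% 1KB\r\n"
output = _crLfRegex.sub("\n", raw)               # normalise line endings
output = re.sub(r"^\s*?\n(.*)$", r"\1", output)  # drop the leading blank line
print(repr(output))  # -> 'file.txt 100% 1KB\n'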

Example 70

Project: gitlab-to-atlassian Source File: dump_gitlab_json.py
def main(argv=None):
    '''
    Process the command line arguments and create the JSON dump.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Export all users/issues from GitLab to JIRA JSON format.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('gitlab_url',
                        help='The full URL to your GitLab instance.')
    parser.add_argument('-d', '--date_filter',
                        help='Only include issues, notes, etc. created after\
                              the specified date. Expected format is \
                              YYYY-MM-DD',
                        type=get_datetime, default='1970-01-01')
    parser.add_argument('-e', '--include_empty',
                        help='Include projects in output that do not have any\
                              issues.',
                        action='store_true')
    parser.add_argument('-i', '--ignore_list',
                        help='List of project names to exclude from dump.',
                        type=argparse.FileType('r'))
    parser.add_argument('-p', '--password',
                        help='The password to use to authenticate if token is \
                              not specified. If password and token are both \
                              unspecified, you will be prompted to enter a \
                              password.')
    parser.add_argument('-P', '--page_size',
                        help='When retrieving results from GitLab, how many \
                              results should be included in a given page?',
                        type=int, default=20)
    parser.add_argument('-s', '--verify_ssl',
                        help='Enable SSL certificate verification',
                        action='store_true')
    parser.add_argument('-t', '--token',
                        help='The private GitLab API token to use for \
                              authentication. Either this or username and \
                              password must be set.')
    parser.add_argument('-u', '--username',
                        help='The username to use for authentication, if token\
                              is unspecified.')
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                             'additional time this flag is specified, ' +
                             'output gets more verbose.',
                        default=0, action='count')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    args.page_size = max(100, args.page_size)

    # Convert verbose flag to actual logging level
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)

    # Setup authenticated GitLab instance
    if args.token:
        git = GitLab(args.gitlab_url, token=args.token,
                     verify_ssl=args.verify_ssl)
    else:
        if not args.username:
            print('Username: ', end="", file=sys.stderr)
            args.username = input('').strip()
        if not args.password:
            args.password = getpass.getpass('Password: ')
        git = GitLab(args.gitlab_url, verify_ssl=args.verify_ssl)
        git.login(args.username, args.password)

    # Initialize output dictionary
    output_dict = defaultdict(list)

    print('Creating project entries...', end="", file=sys.stderr)
    sys.stderr.flush()
    key_set = set()
    mentioned_users = set()
    if args.ignore_list is not None:
        ignore_list = {line.strip().lower() for line in args.ignore_list}
    else:
        ignore_list = {}
    for project in gen_all_results(git.getprojectsall,
                                   per_page=args.page_size):
        proj_name_lower = project['name'].lower()
        if proj_name_lower not in ignore_list and project['issues_enabled']:
            project_issues = []
            for issue in gen_all_results(git.getprojectissues, project['id'],
                                         per_page=args.page_size):
                if args.date_filter < parsedate(issue['updated_at']).replace(tzinfo=None):
                    project_issues.append(issue)
                else:
                    for note in git.getissuewallnotes(project['id'],
                                                      issue['id']):
                        if args.date_filter < parsedate(issue['updated_at']).replace(tzinfo=None):
                            project_issues.append(issue)
                            break

            if project_issues or args.include_empty:
                jira_project = {}
                jira_project['name'] = project['name_with_namespace']
                key = project['name']
                if key.islower():
                    key = key.title()
                key = re.sub(r'[^A-Z]', '', key)
                if len(key) < 2:
                    key = re.sub(r'[^A-Za-z]', '',
                                 project['name'])[0:2].upper()
                added = False
                suffix = 65
                while key in key_set:
                    if not added:
                        key += 'A'
                    else:
                        suffix += 1
                        key = key[:-1] + chr(suffix)
                key_set.add(key)
                jira_project['key'] = key
                jira_project['description'] = md_to_wiki(project['description'])
                # jira_project['created'] = project['created_at']
                jira_project['issues'] = []
                for issue in project_issues:
                    jira_issue = {}
                    jira_issue['externalId'] = issue['iid']
                    if issue['state'] == 'closed':
                        jira_issue['status'] = 'Closed'
                        jira_issue['resolution'] = 'Resolved'
                    else:
                        jira_issue['status'] = 'Open'

                    jira_issue['description'] = md_to_wiki(issue['description'])
                    jira_issue['reporter'] = issue['author']['username']
                    mentioned_users.add(jira_issue['reporter'])
                    jira_issue['labels'] = issue['labels']
                    jira_issue['summary'] = issue['title']
                    if issue['assignee']:
                        jira_issue['assignee'] = issue['assignee']['username']
                        mentioned_users.add(jira_issue['assignee'])
                    jira_issue['issueType'] = 'Bug'
                    jira_issue['comments'] = []
                    # Get all comments/notes
                    for note in git.getissuewallnotes(project['id'],
                                                      issue['id']):
                        jira_note = {}
                        jira_note['body'] = md_to_wiki(note['body'])
                        jira_note['author'] = note['author']['username']
                        mentioned_users.add(jira_note['author'])
                        jira_note['created'] = note['created_at']
                        jira_issue['comments'].append(jira_note)
                    jira_project['issues'].append(jira_issue)

                output_dict['projects'].append(jira_project)
        print('.', end="", file=sys.stderr)
        sys.stderr.flush()

    print('\nCreating user entries...', end="", file=sys.stderr)
    sys.stderr.flush()
    for user in gen_all_results(git.getusers, per_page=args.page_size):
        # Only add users who are actually referenced in issues
        if user['username'] in mentioned_users:
            jira_user = {}
            jira_user['name'] = user['username']
            jira_user['fullname'] = user['name']
            jira_user['email'] = user['email']
            jira_user['groups'] = ['gitlab-users']
            jira_user['active'] = (user['state'] == 'active')
            output_dict['users'].append(jira_user)
        print('.', end="", file=sys.stderr)
        sys.stderr.flush()

    print('\nPrinting JSON output...', file=sys.stderr)
    sys.stderr.flush()
    print(json.dumps(output_dict, indent=4))
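
The key-generation step above uses two re.sub calls: the first keeps only the capital letters of the project name, and the second falls back to the first two alphabetic characters when fewer than two capitals survive. A standalone sketch of that logic (the key_set collision handling is omitted):

import re

def project_key(name):
    # Title-case all-lowercase names so every word contributes a capital.
    if name.islower():
        name = name.title()
    # Keep only the capital letters, e.g. "GitLab Importer" -> "GLI".
    key = re.sub(r'[^A-Z]', '', name)
    if len(key) < 2:
        # Fall back to the first two alphabetic characters, upper-cased.
        key = re.sub(r'[^A-Za-z]', '', name)[0:2].upper()
    return key

print(project_key('gitlab to atlassian'))  # GTA
print(project_key('io'))                   # IO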

Example 71

Project: webrecorder Source File: usercontroller.py
    def init_routes(self):

        @self.app.get(['/api/v1/dashboard', '/api/v1/dashboard/'])
        @self.manager.admin_view()
        def api_dashboard():
            cache_key = self.cache_template.format('dashboard')
            expiry = 5 * 60  # 5 min

            cache = self.manager.redis.get(cache_key)

            if cache:
                return json.loads(cache.decode('utf-8'))

            users = self.manager.get_users().items()
            results = []

            # add username and get collections
            for user, data in users:
                data['username'] = user
                results.append(data)

            temp = self.manager.redis.hgetall(self.temp_usage_key)
            user = self.manager.redis.hgetall(self.user_usage_key)
            temp = [(k.decode('utf-8'), int(v)) for k, v in temp.items()]
            user = [(k.decode('utf-8'), int(v)) for k, v in user.items()]

            data = {
                'users': UserSchema().load(results, many=True).data,
                'collections': self.manager.get_collections(user='*', api=True),
                'temp_usage': sorted(temp, key=itemgetter(0)),
                'user_usage': sorted(user, key=itemgetter(0)),
            }

            self.manager.redis.setex(cache_key,
                                     expiry,
                                     json.dumps(data, cls=CustomJSONEncoder))

            return data


        @self.app.get(['/api/v1/users', '/api/v1/users/'])
        @self.manager.admin_view()
        def api_users():
            """Full admin API resource of all users,
               containing user info and public collections.

               - Provides basic (1 dimension) RESTful sorting
               - TODO: Pagination
            """
            sorting = request.query.getunicode('sort', None)
            sort_key = sub(r'^-{1}?', '', sorting) if sorting is not None else None
            reverse = sorting.startswith('-') if sorting is not None else False

            def dt(d):
                return datetime.strptime(d, '%Y-%m-%d %H:%M:%S.%f')

            # sortable fields, with optional key unpacking functions
            filters = {
                'created': {'key': lambda obj: dt(obj[1]['creation_date'])},
                'email': {'key': lambda obj: obj[1]['email_addr']},
                'last_login': {'key': lambda obj: dt(obj[1]['last_login'])},
                'name': {'key': lambda obj: json.loads(obj[1]['desc'] or '{}')['name']},
                'username': {},
            }

            if sorting is not None and sort_key not in filters:
                raise HTTPError(400, 'Bad Request')

            sort_by = filters[sort_key] if sorting is not None else {}
            users = sorted(self.manager.get_users().items(),
                           **sort_by,
                           reverse=reverse)

            results = []

            # add username and get collections
            for user, data in users:
                data['username'] = user
                # add space usage
                total = self.manager.get_size_allotment(user)
                used = self.manager.get_size_usage(user)
                data['space_utilization'] = {
                    'total': total,
                    'used': used,
                    'available': total - used,
                }
                results.append(data)

            return {
                # `results` is a list so will always read as `many`
                'users': UserSchema().load(results, many=True).data
            }

        @self.app.get('/api/v1/anon_user')
        def get_anon_user():
            return {'anon_user': self.manager.get_anon_user(True)}

        @self.app.get('/api/v1/temp-users')
        @self.manager.admin_view()
        def temp_users():
            """ Resource returning active temp users
            """
            temp_users_keys = self.manager.redis.keys('u:{0}*'.format(self.temp_user_key))
            temp_users = []

            if len(temp_users_keys):
                with self.manager.redis.pipeline() as pi:
                    for user in temp_users_keys:
                        pi.hgetall(user)
                    temp_users = pi.execute()

                for idx, user in enumerate(temp_users_keys):
                    temp_users[idx][b'username'] = user

                # convert bytestrings, skip over incomplete
                temp_users = [{k.decode('utf-8'): v.decode('utf-8') for k, v in d.items()}
                              for d in temp_users
                              if b'max_size' in d and b'created_at' in d]

                for user in temp_users:
                    total = int(user['max_size'])
                    used = int(user.get('size', 0))
                    creation = datetime.fromtimestamp(int(user['created_at']))
                    removal = creation + timedelta(seconds=self.config['session.durations']['short']['total'])

                    u = re.search(r'{0}\w+'.format(self.temp_user_key),
                                  user['username']).group()
                    user['username'] = u
                    user['removal'] = removal.isoformat()
                    user['space_utilization'] = {
                        'total': total,
                        'used': used,
                        'available': total - used,
                    }

                temp_users, err = TempUserSchema().load(temp_users, many=True)
                if err:
                    return {'errors': err}

            return {'users': temp_users}

        @self.app.post('/api/v1/users/<user>/desc')
        def update_desc(user):
            """legacy, eventually move to the patch endpoint"""
            desc = request.body.read().decode('utf-8')

            self.manager.set_user_desc(user, desc)
            return {}

        @self.app.post(['/api/v1/users', '/api/v1/users/'])
        @self.manager.admin_view()
        def api_create_user():
            """API endpoint to create a user with schema validation"""
            users = self.manager.get_users()
            emails = [u[1]['email_addr'] for u in users.items()]
            data = request.json
            err = NewUserSchema().validate(data)

            if 'username' in data and data['username'] in users:
                if not err:
                    return {'errors': 'Username already exists'}
                else:
                    err.update({'username': 'Username already exists'})

            if 'email' in data and data['email'] in emails:
                if not err:
                    return {'errors': 'Email already exists'}
                else:
                    err.update({'email': 'Email already exists'})

            # validate
            if len(err):
                return {'errors': err}

            # create user
            self.manager.cork._store.users[data['username']] = {
                'role': data['role'],
                'hash': self.manager.cork._hash(data['username'],
                                                data['password']).decode('ascii'),
                'email_addr': data['email'],
                'desc': '{{"name":"{name}"}}'.format(name=data.get('name', '')),
                'creation_date': str(datetime.utcnow()),
                'last_login': str(datetime.utcnow()),
            }
            self.manager.cork._store.save_users()

            # add user account defaults
            key = self.manager.user_key.format(user=data['username'])
            now = int(time.time())

            max_size, max_coll = self.manager.redis.hmget('h:defaults',
                                                          ['max_size', 'max_coll'])
            if not max_size:
                max_size = self.manager.default_max_size

            if not max_coll:
                max_coll = self.manager.default_max_coll

            with redis.utils.pipeline(self.manager.redis) as pi:
                pi.hset(key, 'max_size', max_size)
                pi.hset(key, 'max_coll', max_coll)
                pi.hset(key, 'created_at', now)
                pi.hset(key, 'name', data.get('name', ''))
                pi.hsetnx(key, 'size', '0')

            # create initial collection
            self.manager.create_collection(
                data['username'],
                coll=self.manager.default_coll['id'],
                coll_title=self.manager.default_coll['title'],
                desc=self.manager.default_coll['desc'].format(data['username']),
                public=False,
                synthetic=True
            )

            # Check for mailing list management
            if self.manager.mailing_list:
                self.manager.add_to_mailing_list(
                    data['username'],
                    data['email'],
                    data.get('name', ''),
                )

        @self.app.get(['/api/v1/users/<username>', '/api/v1/users/<username>/'])
        @self.manager.admin_view()
        def api_get_user(username):
            """API endpoint to return user info"""
            users = self.manager.get_users()

            if username not in users:
                self._raise_error(404, 'No such user')

            user = users[username]

            # assemble space usage
            total = self.manager.get_size_allotment(username)
            used = self.manager.get_size_usage(username)
            user['space_utilization'] = {
                'total': total,
                'used': used,
                'available': total - used,
            }

            user_data, err = UserSchema(exclude=('username',)).load(user)
            colls = self.manager.get_collections(username,
                                                 include_recs=True,
                                                 api=True)

            for coll in colls:
                for rec in coll['recordings']:
                    rec['pages'] = self.manager.list_pages(username,
                                                           coll['id'],
                                                           rec['id'])

            # colls is a list so will always be `many` even if one collection
            collections, err = CollectionSchema().load(colls, many=True)
            user_data['collections'] = collections

            return {'user': user_data}

        @self.app.put(['/api/v1/users/<username>', '/api/v1/users/<username>/'])
        @self.manager.auth_view()
        def api_update_user(username):
            """API endpoint to update user info

               See `UserUpdateSchema` for available fields.

               ** bottle 0.12.9 doesn't support `PATCH` methods; update to
                  PATCH once available.
            """
            users = self.manager.get_users()
            if username not in users:
                self._raise_error(404, 'No such user')

            # if not admin, check ownership
            if not self.manager.is_anon(username) and not self.manager.is_superuser():
                self.manager.assert_user_is_owner(username)

            user = users[username]
            try:
                json_data = json.loads(request.forms.json)
            except Exception as e:
                print(e)
                return {'errors': 'bad json data'}

            if len(json_data.keys()) == 0:
                return {'errors': 'empty payload'}

            data, err = UserUpdateSchema(only=json_data.keys()).load(json_data)

            if len(err):
                return {'errors': err}

            if 'name' in data:
                user['desc'] = '{{"name":"{name}"}}'.format(name=data.get('name', ''))

            #
            # restricted resources
            #
            if 'max_size' in data and self.manager.is_superuser():
                key = self.manager.user_key.format(user=username)
                max_size = data.get('max_size', self.manager.default_max_size)
                max_size = int(max_size) if type(max_size) is not int else max_size

                with redis.utils.pipeline(self.manager.redis) as pi:
                    pi.hset(key, 'max_size', max_size)

            if 'role' in data and self.manager.is_superuser():
                # set new role or default to base role
                user['role'] = data.get('role', 'archivist')

            #
            # return updated user data
            #
            total = self.manager.get_size_allotment(username)
            used = self.manager.get_size_usage(username)
            user['space_utilization'] = {
                'total': total,
                'used': used,
                'available': total - used,
            }

            user_data, err = UserSchema(exclude=('username',)).load(user)
            colls = self.manager.get_collections(username,
                                                 include_recs=True,
                                                 api=True)

            for coll in colls:
                for rec in coll['recordings']:
                    rec['pages'] = self.manager.list_pages(username,
                                                           coll['id'],
                                                           rec['id'])

            # colls is a list so will always be `many` even if one collection
            collections, err = CollectionSchema().load(colls, many=True)
            user_data['collections'] = collections

            return {'user': user_data}

        @self.app.delete(['/api/v1/users/<user>', '/api/v1/users/<user>/'])
        @self.manager.admin_view()
        def api_delete_user(user):
            """API endpoint to delete a user"""
            if user not in self.manager.get_users():
                self._raise_error(404, 'No such user')

            self.manager.delete_user(user)

        @self.app.get(['/<user>', '/<user>/'])
        @self.jinja2_view('user.html')
        def user_info(user):
            self.redir_host()

            if self.manager.is_anon(user):
                self.redirect('/' + user + '/temp')

            self.manager.assert_user_exists(user)

            result = {
                'user': user,
                'user_info': self.manager.get_user_info(user),
                'collections': self.manager.get_collections(user),
            }

            if not result['user_info'].get('desc'):
                result['user_info']['desc'] = self.default_user_desc.format(user)

            return result

        # User Account Settings
        @self.app.get('/<user>/_settings')
        @self.jinja2_view('account.html')
        def account_settings(user):
            self.manager.assert_user_is_owner(user)

            return {'user': user,
                    'user_info': self.manager.get_user_info(user),
                    'num_coll': self.manager.num_collections(user),
                   }

        # Delete User Account
        @self.app.post('/<user>/$delete')
        def delete_user(user):
            if self.manager.delete_user(user):
                self.flash_message('The user {0} has been permanently deleted!'.format(user), 'success')

                redir_to = '/'
                request.environ['webrec.delete_all_cookies'] = 'all'
                self.manager.cork.logout(success_redirect=redir_to, fail_redirect=redir_to)
            else:
                self.flash_message('There was an error deleting {0}'.format(coll))
                self.redirect(self.get_path(user))

        # Expiry Message
        @self.app.route('/_expire')
        def expire():
            self.flash_message('Sorry, the anonymous collection has expired due to inactivity')
            self.redirect('/')

        @self.app.post('/_reportissues')
        def report_issues():
            useragent = request.headers.get('User-Agent')

            @self.jinja2_view('email_error.html')
            def error_email(params):
                ua = UserAgent(params.get('ua'))
                if ua.browser:
                    browser = '{0} {1} {2} {3}'
                    lang = ua.language or ''
                    browser = browser.format(ua.platform, ua.browser,
                                             ua.version, lang)

                    params['browser'] = browser
                else:
                    params['browser'] = ua.string

                params['time'] = params['time'][:19]
                return params

            self.manager.report_issues(request.POST, useragent, error_email)
            return {}

        # Skip POST request recording
        @self.app.get('/_skipreq')
        def skip_req():
            url = request.query.getunicode('url')
            user = self.manager.get_curr_user()
            if not user:
                user = self.manager.get_anon_user()

            self.manager.skip_post_req(user, url)
            return {}
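
In api_users above, re.sub (imported as sub) only has to strip a leading '-' from the sort query parameter: '-created' selects the created filter with reverse ordering, while 'created' sorts ascending. The original pattern r'^-{1}?' behaves like a plain r'^-'. A small sketch of that parsing under the same convention:

import re

def parse_sort(sorting):
    """Split a '-created' style sort parameter into (key, reverse)."""
    if sorting is None:
        return None, False
    # A single leading '-' flags descending order; drop it to get the key.
    return re.sub(r'^-', '', sorting), sorting.startswith('-')

print(parse_sort('-last_login'))  # ('last_login', True)
print(parse_sort('email'))        # ('email', False)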

Example 72

Project: wikiteam Source File: uploader.py
def upload(wikis, config={}):
    headers = {'User-Agent': dumpgenerator.getUserAgent()}

    for wiki in wikis:
        print "#"*73
        print "# Uploading", wiki
        print "#"*73
        wiki = wiki.lower()
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})

        wikiname = prefix.split('-')[0]
        dumps = []
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            item = get_item('wiki-' + wikiname)
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                        # Simplistic quick&dirty check for the presence of this file in the item
                        stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                        dumphash = re.sub(' +.+\n?', '', stdout)

                        if dumphash in map(lambda x: x['md5'], item.files):
                            log(wiki, dump, 'verified')
                            rmline='rm -rf %s' % dump
                            if not os.system(rmline):
                                print 'DELETED ' + dump
                            print '%s was uploaded before, skipping...' % (dump)
                            continue
                        else:
                            print 'ERROR: The online item misses ' + dump
                            log(wiki, dump, 'missing')
                            # We'll fall out of this if-block and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue

            time.sleep(0.1)
            wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            ismissingitem = not item.exists

            # Logo path
            logourl = ''

            if ismissingitem or config['update']:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()

                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                raw = ''
                try:
                    f = urllib.urlopen(baseurl)
                    raw = f.read()
                    f.close()
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'
                try:
                    logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
                except:
                    pass
                print logourl

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)# "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')

                wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            if c == 0:
                # Item metadata
                md = {
                    'mediatype': 'web',
                    'collection': config['collection'],
                    'title': wikititle,
                    'description': wikidesc,
                    'language': lang,
                    'last-updated-date': wikidate_text,
                    'subject': '; '.join(wikikeys), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    'licenseurl': wikilicenseurl and urlparse.urljoin(wiki, wikilicenseurl),
                    'rights': wikirights,
                    'originalurl': wikiurl,
                }

            #Upload files and update metadata
            try:
                item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
                item.modify_metadata(md) # update
                print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
                if logourl:
                    logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
                    logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
                    logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
                    item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
            except:
                print wiki, dump, 'error when uploading?'

            c += 1
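
Two re.sub calls in the uploader are worth isolating: one trims md5sum output down to just the hash by deleting everything from the first run of spaces onward, the other strips /api.php from the wiki URL to recover the base URL. A Python 3 sketch of both with sample inputs:

import re

md5sum_output = 'd41d8cd98f00b204e9800998ecf8427e  ecgpedia-20140101-wikidump.7z\n'
# Delete the space run and everything after it, leaving only the hash.
dumphash = re.sub(r' +.+\n?', '', md5sum_output)
print(dumphash)  # d41d8cd98f00b204e9800998ecf8427e

wiki = 'http://en.ecgpedia.org/api.php'
# Case-insensitive removal of /api.php yields the base URL of the wiki.
baseurl = re.sub(r'(?im)/api\.php', '', wiki)
print(baseurl)   # http://en.ecgpedia.org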

Example 73

Project: utter-pool Source File: __init__.py
Function: linkify
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
            parse_email=False, tokenizer=HTMLSanitizer):
    """Convert URL-like strings in an HTML fragment to links.

    linkify() converts strings that look like URLs or domain names in a
    blob of text that may be an HTML fragment to links, while preserving
    (a) links already in the string, (b) urls found in attributes, and
    (c) email addresses.
    """
    text = force_unicode(text)

    if not text:
        return u''

    parser = html5lib.HTMLParser(tokenizer=tokenizer)

    forest = parser.parseFragment(text)

    def replace_nodes(tree, new_frag, node):
        new_tree = parser.parseFragment(new_frag)
        for n in new_tree.childNodes:
            # Prevent us from re-parsing new links as existing links.
            if n.name == 'a':
                n._seen = True
            tree.insertBefore(n, node)
        tree.removeChild(node)
        # Return the number of new nodes.
        return len(new_tree.childNodes) - 1

    def strip_wrapping_parentheses(fragment):
        """Strips wrapping parentheses.

        Returns a tuple of the following format::

            (string stripped from wrapping parentheses,
             count of stripped opening parentheses,
             count of stripped closing parentheses)
        """
        opening_parentheses = closing_parentheses = 0
        # Count consecutive opening parentheses
        # at the beginning of the fragment (string).
        for char in fragment:
            if char == '(':
                opening_parentheses += 1
            else:
                break

        if opening_parentheses:
            newer_frag = ''
            # Cut the consecutive opening brackets from the fragment.
            fragment = fragment[opening_parentheses:]
            # Reverse the fragment for easier detection of parentheses
            # inside the URL.
            reverse_fragment = fragment[::-1]
            skip = False
            for char in reverse_fragment:
                # Remove the closing parentheses if it has a matching
                # opening parentheses (they are balanced).
                if (char == ')' and
                        closing_parentheses < opening_parentheses and
                        not skip):
                    closing_parentheses += 1
                    continue
                # Do not remove ')' from the URL itself.
                elif char != ')':
                    skip = True
                newer_frag += char
            fragment = newer_frag[::-1]

        return fragment, opening_parentheses, closing_parentheses

    def apply_callbacks(attrs, new):
        for cb in callbacks:
            attrs = cb(attrs, new)
            if attrs is None:
                return None
        return attrs

    def linkify_nodes(tree, parse_text=True):
        # I know this isn't Pythonic, but we're sometimes mutating
        # tree.childNodes, which ends up breaking the loop and causing us to
        # reparse code.
        children = len(tree.childNodes)
        current = 0  # A pointer to the "current" node.
        while current < children:
            node = tree.childNodes[current]
            if node.type == NODE_TEXT and parse_text:
                new_frag = _render(node)
                # Look for email addresses?
                if parse_email:
                    new_frag = re.sub(email_re, email_repl, new_frag)
                    if new_frag != _render(node):
                        adj = replace_nodes(tree, new_frag, node)
                        children += adj
                        current += adj
                        linkify_nodes(tree)
                        continue
                new_frag = re.sub(url_re, link_repl, new_frag)
                if new_frag != _render(node):
                    adj = replace_nodes(tree, new_frag, node)
                    children += adj
                    current += adj
            elif node.name == 'a' and not getattr(node, '_seen', False):
                if 'href' in node.attributes:
                    attrs = node.attributes
                    _text = attrs['_text'] = ''.join(_render(c) for
                                                     c in node.childNodes)
                    attrs = apply_callbacks(attrs, False)
                    if attrs is not None:
                        text = force_unicode(attrs.pop('_text'))
                        node.attributes = attrs
                        for n in node.childNodes:
                            node.removeChild(n)
                        node.insertText(text)
                        node._seen = True
                    else:
                        replace_nodes(tree, _text, node)
            elif skip_pre and node.name == 'pre':
                linkify_nodes(node, False)
            elif not getattr(node, '_seen', False):
                linkify_nodes(node)
            current += 1

    def email_repl(match):
        addr = match.group(0).replace('"', '&quot;')
        link = {
            '_text': addr,
            'href': 'mailto:%s' % addr,
        }
        link = apply_callbacks(link, True)

        if link is None:
            return addr

        _href = link.pop('href')
        _text = link.pop('_text')

        repl = '<a href="%s" %s>%s</a>'
        attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
        return repl % (_href, attribs, _text)

    def link_repl(match):
        url = match.group(0)
        open_brackets = close_brackets = 0
        if url.startswith('('):
            url, open_brackets, close_brackets = (
                    strip_wrapping_parentheses(url)
            )
        end = u''
        m = re.search(punct_re, url)
        if m:
            end = m.group(0)
            url = url[0:m.start()]
        if re.search(proto_re, url):
            href = url
        else:
            href = u''.join([u'http://', url])

        link = {
            '_text': url,
            'href': href,
        }

        link = apply_callbacks(link, True)

        if link is None:
            return url

        _text = link.pop('_text')
        _href = link.pop('href')

        repl = u'%s<a href="%s" %s>%s</a>%s%s'
        attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())

        return repl % ('(' * open_brackets,
                       _href, attribs, _text, end,
                       ')' * close_brackets)

    try:
        linkify_nodes(forest)
    except (RECURSION_EXCEPTION), e:
        # If we hit the max recursion depth, just return what we've got.
        log.error('Probable recursion error: %r' % e, exc_info=sys.exc_info())

    return _render(forest)
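
linkify relies on a re.sub feature used by both email_repl and link_repl above: the replacement argument may be a callable that receives each match object and returns the replacement text. A self-contained sketch of that pattern, with a deliberately simplified URL regex standing in for the module's url_re:

import re

# Simplified stand-in for the url_re used by linkify.
simple_url_re = re.compile(r'\bhttps?://\S+', re.IGNORECASE)

def link_repl(match):
    url = match.group(0)
    return '<a href="%s">%s</a>' % (url, url)

text = 'See https://example.com for details.'
print(re.sub(simple_url_re, link_repl, text))
# See <a href="https://example.com">https://example.com</a> for details.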

Example 74

Project: trackma Source File: libmal.py
Function: parse_xml
    def _parse_xml(self, data):
        # For some reason MAL returns an XML file with HTML exclusive
        # entities like &aacute;, so we have to create a custom XMLParser
        # to convert these entities correctly.

        ENTITIES = {
            "nbsp":     u'\u00A0',
            "iexcl":    u'\u00A1',
            "cent":     u'\u00A2',
            "pound":    u'\u00A3',
            "curren":   u'\u00A4',
            "yen":      u'\u00A5',
            "brvbar":   u'\u00A6',
            "sect":     u'\u00A7',
            "uml":      u'\u00A8',
            "copy":     u'\u00A9',
            "ordf":     u'\u00AA',
            "laquo":    u'\u00AB',
            "not":      u'\u00AC',
            "shy":      u'\u00AD',
            "reg":      u'\u00AE',
            "macr":     u'\u00AF',
            "deg":      u'\u00B0',
            "plusmn":   u'\u00B1',
            "sup2":     u'\u00B2',
            "sup3":     u'\u00B3',
            "acute":    u'\u00B4',
            "micro":    u'\u00B5',
            "para":     u'\u00B6',
            "middot":   u'\u00B7',
            "cedil":    u'\u00B8',
            "sup1":     u'\u00B9',
            "ordm":     u'\u00BA',
            "raquo":    u'\u00BB',
            "frac14":   u'\u00BC',
            "frac12":   u'\u00BD',
            "frac34":   u'\u00BE',
            "iquest":   u'\u00BF',
            "Agrave":   u'\u00C0',
            "Aacute":   u'\u00C1',
            "Acirc":    u'\u00C2',
            "Atilde":   u'\u00C3',
            "Auml":     u'\u00C4',
            "Aring":    u'\u00C5',
            "AElig":    u'\u00C6',
            "Ccedil":   u'\u00C7',
            "Egrave":   u'\u00C8',
            "Eacute":   u'\u00C9',
            "Ecirc":    u'\u00CA',
            "Euml":     u'\u00CB',
            "Igrave":   u'\u00CC',
            "Iacute":   u'\u00CD',
            "Icirc":    u'\u00CE',
            "Iuml":     u'\u00CF',
            "ETH":      u'\u00D0',
            "Ntilde":   u'\u00D1',
            "Ograve":   u'\u00D2',
            "Oacute":   u'\u00D3',
            "Ocirc":    u'\u00D4',
            "Otilde":   u'\u00D5',
            "Ouml":     u'\u00D6',
            "times":    u'\u00D7',
            "Oslash":   u'\u00D8',
            "Ugrave":   u'\u00D9',
            "Uacute":   u'\u00DA',
            "Ucirc":    u'\u00DB',
            "Uuml":     u'\u00DC',
            "Yacute":   u'\u00DD',
            "THORN":    u'\u00DE',
            "szlig":    u'\u00DF',
            "agrave":   u'\u00E0',
            "aacute":   u'\u00E1',
            "acirc":    u'\u00E2',
            "atilde":   u'\u00E3',
            "auml":     u'\u00E4',
            "aring":    u'\u00E5',
            "aelig":    u'\u00E6',
            "ccedil":   u'\u00E7',
            "egrave":   u'\u00E8',
            "eacute":   u'\u00E9',
            "ecirc":    u'\u00EA',
            "euml":     u'\u00EB',
            "igrave":   u'\u00EC',
            "iacute":   u'\u00ED',
            "icirc":    u'\u00EE',
            "iuml":     u'\u00EF',
            "eth":      u'\u00F0',
            "ntilde":   u'\u00F1',
            "ograve":   u'\u00F2',
            "oacute":   u'\u00F3',
            "ocirc":    u'\u00F4',
            "otilde":   u'\u00F5',
            "ouml":     u'\u00F6',
            "divide":   u'\u00F7',
            "oslash":   u'\u00F8',
            "ugrave":   u'\u00F9',
            "uacute":   u'\u00FA',
            "ucirc":    u'\u00FB',
            "uuml":     u'\u00FC',
            "yacute":   u'\u00FD',
            "thorn":    u'\u00FE',
            "yuml":     u'\u00FF',
            "fnof":     u'\u0192',
            "Alpha":    u'\u0391',
            "Beta":     u'\u0392',
            "Gamma":    u'\u0393',
            "Delta":    u'\u0394',
            "Epsilon":  u'\u0395',
            "Zeta":     u'\u0396',
            "Eta":      u'\u0397',
            "Theta":    u'\u0398',
            "Iota":     u'\u0399',
            "Kappa":    u'\u039A',
            "Lambda":   u'\u039B',
            "Mu":       u'\u039C',
            "Nu":       u'\u039D',
            "Xi":       u'\u039E',
            "Omicron":  u'\u039F',
            "Pi":       u'\u03A0',
            "Rho":      u'\u03A1',
            "Sigma":    u'\u03A3',
            "Tau":      u'\u03A4',
            "Upsilon":  u'\u03A5',
            "Phi":      u'\u03A6',
            "Chi":      u'\u03A7',
            "Psi":      u'\u03A8',
            "Omega":    u'\u03A9',
            "alpha":    u'\u03B1',
            "beta":     u'\u03B2',
            "gamma":    u'\u03B3',
            "delta":    u'\u03B4',
            "epsilon":  u'\u03B5',
            "zeta":     u'\u03B6',
            "eta":      u'\u03B7',
            "theta":    u'\u03B8',
            "iota":     u'\u03B9',
            "kappa":    u'\u03BA',
            "lambda":   u'\u03BB',
            "mu":       u'\u03BC',
            "nu":       u'\u03BD',
            "xi":       u'\u03BE',
            "omicron":  u'\u03BF',
            "pi":       u'\u03C0',
            "rho":      u'\u03C1',
            "sigmaf":   u'\u03C2',
            "sigma":    u'\u03C3',
            "tau":      u'\u03C4',
            "upsilon":  u'\u03C5',
            "phi":      u'\u03C6',
            "chi":      u'\u03C7',
            "psi":      u'\u03C8',
            "omega":    u'\u03C9',
            "thetasym": u'\u03D1',
            "upsih":    u'\u03D2',
            "piv":      u'\u03D6',
            "bull":     u'\u2022',
            "hellip":   u'\u2026',
            "prime":    u'\u2032',
            "Prime":    u'\u2033',
            "oline":    u'\u203E',
            "frasl":    u'\u2044',
            "weierp":   u'\u2118',
            "image":    u'\u2111',
            "real":     u'\u211C',
            "trade":    u'\u2122',
            "alefsym":  u'\u2135',
            "larr":     u'\u2190',
            "uarr":     u'\u2191',
            "rarr":     u'\u2192',
            "darr":     u'\u2193',
            "harr":     u'\u2194',
            "crarr":    u'\u21B5',
            "lArr":     u'\u21D0',
            "uArr":     u'\u21D1',
            "rArr":     u'\u21D2',
            "dArr":     u'\u21D3',
            "hArr":     u'\u21D4',
            "forall":   u'\u2200',
            "part":     u'\u2202',
            "exist":    u'\u2203',
            "empty":    u'\u2205',
            "nabla":    u'\u2207',
            "isin":     u'\u2208',
            "notin":    u'\u2209',
            "ni":       u'\u220B',
            "prod":     u'\u220F',
            "sum":      u'\u2211',
            "minus":    u'\u2212',
            "lowast":   u'\u2217',
            "radic":    u'\u221A',
            "prop":     u'\u221D',
            "infin":    u'\u221E',
            "ang":      u'\u2220',
            "and":      u'\u2227',
            "or":       u'\u2228',
            "cap":      u'\u2229',
            "cup":      u'\u222A',
            "int":      u'\u222B',
            "there4":   u'\u2234',
            "sim":      u'\u223C',
            "cong":     u'\u2245',
            "asymp":    u'\u2248',
            "ne":       u'\u2260',
            "equiv":    u'\u2261',
            "le":       u'\u2264',
            "ge":       u'\u2265',
            "sub":      u'\u2282',
            "sup":      u'\u2283',
            "nsub":     u'\u2284',
            "sube":     u'\u2286',
            "supe":     u'\u2287',
            "oplus":    u'\u2295',
            "otimes":   u'\u2297',
            "perp":     u'\u22A5',
            "sdot":     u'\u22C5',
            "lceil":    u'\u2308',
            "rceil":    u'\u2309',
            "lfloor":   u'\u230A',
            "rfloor":   u'\u230B',
            "lang":     u'\u2329',
            "rang":     u'\u232A',
            "loz":      u'\u25CA',
            "spades":   u'\u2660',
            "clubs":    u'\u2663',
            "hearts":   u'\u2665',
            "diams":    u'\u2666',
            "quot":     u'\"'    ,
            "amp":      u'&'     ,
            "lt":       u'<'     ,
            "gt":       u'>'     ,
            "OElig":    u'\u0152',
            "oelig":    u'\u0153',
            "Scaron":   u'\u0160',
            "scaron":   u'\u0161',
            "Yuml":     u'\u0178',
            "circ":     u'\u02C6',
            "tilde":    u'\u02DC',
            "ensp":     u'\u2002',
            "emsp":     u'\u2003',
            "thinsp":   u'\u2009',
            "zwnj":     u'\u200C',
            "zwj":      u'\u200D',
            "lrm":      u'\u200E',
            "rlm":      u'\u200F',
            "ndash":    u'\u2013',
            "mdash":    u'\u2014',
            "lsquo":    u'\u2018',
            "rsquo":    u'\u2019',
            "sbquo":    u'\u201A',
            "ldquo":    u'\u201C',
            "rdquo":    u'\u201D',
            "bdquo":    u'\u201E',
            "dagger":   u'\u2020',
            "Dagger":   u'\u2021',
            "permil":   u'\u2030',
            "lsaquo":   u'\u2039',
            "rsaquo":   u'\u203A',
            "euro":     u'\u20AC',
        }

        # http://stackoverflow.com/a/35591479/2016221
        magic = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" [\n'''
        magic += ''.join("<!ENTITY %s '&#%d;'>\n" % (key, ord(value)) for key, value in ENTITIES.items())
        magic += '\n]>'

        # strip xml declaration since we're concatenating something before it
        data = re.sub('<\?.*?\?>', '', data)

        return ET.fromstring(magic + data)
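
The single re.sub here strips the XML declaration so the entity-defining DOCTYPE can be prepended; in Python 3 this also sidesteps the ValueError ElementTree raises for str input that carries an encoding declaration. A trimmed sketch of the same trick with just two entities:

import re
import xml.etree.ElementTree as ET

data = '<?xml version="1.0" encoding="UTF-8"?><anime><title>&aacute;lbum</title></anime>'

# Declare the HTML-only entities as numeric character references.
entities = {'aacute': u'\u00E1', 'eacute': u'\u00E9'}
magic = '<!DOCTYPE anime [\n'
magic += ''.join("<!ENTITY %s '&#%d;'>\n" % (k, ord(v)) for k, v in entities.items())
magic += ']>'

# Strip the XML declaration, since nothing may precede it in a document.
data = re.sub(r'<\?.*?\?>', '', data)

root = ET.fromstring(magic + data)
print(root.find('title').text)  # álbum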

Example 75

Project: QSTK Source File: html_colorize.py
Function: handle_line
    def handle_line(self, line):
        """
        Render a single logical line from the module, and write the
        generated HTML to C{self.out}.

        @param line: A single logical line, encoded as a list of
            C{(toktype,toktext)} pairs corresponding to the tokens in
            the line.
        """
        # def_name is the name of the function or class defined by
        # this line; or None if no function or class is defined.
        def_name = None

        # def_type is the type of the function or class defined by
        # this line; or None if no function or class is defined.
        def_type = None

        # does this line start a class/func def?
        starting_def_block = False 

        in_base_list = False
        in_param_list = False
        in_param_default = 0
        at_module_top = (self.lineno == 1)

        ended_def_blocks = 0

        # The html output.
        if self.ADD_LINE_NUMBERS:
            s = self.lineno_to_html()
            self.lineno += 1
        else:
            s = ''
        s += '  <tt class="py-line">'

        # Loop through each token, and colorize it appropriately.
        for i, (toktype, toktext) in enumerate(line):
            if type(s) is not str:
                if type(s) is unicode:
                    log.error('While colorizing %s -- got unexpected '
                              'unicode string' % self.module_name)
                    s = s.encode('ascii', 'xmlcharrefreplace')
                else:
                    raise ValueError('Unexpected value for s -- %s' % 
                                     type(s).__name__)

            # For each token, determine its css class and whether it
            # should link to a url.
            css_class = None
            url = None
            tooltip = None
            onclick = uid = targets = None # these 3 are used together.

            # Is this token the class name in a class definition?  If
            # so, then make it a link back into the API docs.
            if i>=2 and line[i-2][1] == 'class':
                in_base_list = True
                css_class = self.CSS_CLASSES['DEFNAME']
                def_name = toktext
                def_type = 'class'
                if 'func' not in self.context_types:
                    cls_name = self.context_name(def_name)
                    url = self.name2url(cls_name)
                    s = self.mark_def(s, cls_name)
                    starting_def_block = True

            # Is this token the function name in a function def?  If
            # so, then make it a link back into the API docs.
            elif i>=2 and line[i-2][1] == 'def':
                in_param_list = True
                css_class = self.CSS_CLASSES['DEFNAME']
                def_name = toktext
                def_type = 'func'
                if 'func' not in self.context_types:
                    cls_name = self.context_name()
                    func_name = self.context_name(def_name)
                    url = self.name2url(cls_name, def_name)
                    s = self.mark_def(s, func_name)
                    starting_def_block = True

            # For each indent, update the indents list (which we use
            # to keep track of indentation strings) and the context
            # list.  If this indent is the start of a class or
            # function def block, then self.def_name will be its name;
            # otherwise, it will be None.
            elif toktype == token.INDENT:
                self.indents.append(toktext)
                self.context.append(self.def_name)
                self.context_types.append(self.def_type)

            # When we dedent, pop the last elements off the indents
            # list and the context list.  If the last context element
            # is a name, then we're ending a class or function def
            # block; so write an end-div tag.
            elif toktype == token.DEDENT:
                self.indents.pop()
                self.context_types.pop()
                if self.context.pop():
                    ended_def_blocks += 1

            # If this token contains whitespace, then don't bother to
            # give it a css tag.
            elif toktype in (None, tokenize.NL, token.NEWLINE,
                             token.ENDMARKER):
                css_class = None

            # Check if the token is a keyword.
            elif toktype == token.NAME and keyword.iskeyword(toktext):
                css_class = self.CSS_CLASSES['KEYWORD']

            elif in_base_list and toktype == token.NAME:
                css_class = self.CSS_CLASSES['BASECLASS']

            elif (in_param_list and toktype == token.NAME and
                  not in_param_default):
                css_class = self.CSS_CLASSES['PARAM']

            # Class/function docstring.
            elif (self.def_name and line[i-1][0] == token.INDENT and
                  self.is_docstring(line, i)):
                css_class = self.CSS_CLASSES['DOCSTRING']

            # Module docstring.
            elif at_module_top and self.is_docstring(line, i):
                css_class = self.CSS_CLASSES['DOCSTRING']

            # check for decorators??
            elif (toktype == token.NAME and
                  ((i>0 and line[i-1][1]=='@') or
                   (i>1 and line[i-1][0]==None and line[i-2][1] == '@'))):
                css_class = self.CSS_CLASSES['DECORATOR']
                self.has_decorators = True

            # If it's a name, try to link it.
            elif toktype == token.NAME:
                css_class = self.CSS_CLASSES['NAME']
                # If we have a variable named `toktext` in the current
                # context, then link to that.  Note that if we're inside
                # a function, then that function is our context, not
                # the namespace that contains it. [xx] this isn't always
                # the right thing to do.
                if (self.GUESS_LINK_TARGETS and self.docindex is not None
                    and self.url_func is not None):
                    context = [n for n in self.context if n is not None]
                    container = self.docindex.get_vardoc(
                        DottedName(self.module_name, *context))
                    if isinstance(container, NamespaceDoc):
                        doc = container.variables.get(toktext)
                        if doc is not None:
                            url = self.url_func(doc)
                            tooltip = str(doc.canonical_name)
                # Otherwise, check the name_to_docs index to see what
                # else this name might refer to.
                if (url is None and self.name_to_docs is not None
                    and self.url_func is not None):
                    docs = self.name_to_docs.get(toktext)
                    if docs:
                        tooltip='\n'.join([str(d.canonical_name)
                                           for d in docs])
                        if len(docs) == 1 and self.GUESS_LINK_TARGETS:
                            url = self.url_func(docs[0])
                        else:
                            uid, onclick, targets = self.doclink(toktext, docs)

            # For all other tokens, look up the CSS class to use
            # based on the token's type.
            else:
                if toktype == token.OP and toktext in self.CSS_CLASSES:
                    css_class = self.CSS_CLASSES[toktext]
                elif token.tok_name[toktype] in self.CSS_CLASSES:
                    css_class = self.CSS_CLASSES[token.tok_name[toktype]]
                else:
                    css_class = None

            # update our status..
            if toktext == ':':
                in_base_list = False
                in_param_list = False
            if toktext == '=' and in_param_list:
                in_param_default = True
            if in_param_default:
                if toktext in ('(','[','{'): in_param_default += 1
                if toktext in (')',']','}'): in_param_default -= 1
                if toktext == ',' and in_param_default == 1:
                    in_param_default = 0
                
            # Write this token, with appropriate colorization.
            if tooltip and self.ADD_TOOLTIPS:
                tooltip_html = ' title="%s"' % tooltip
            else: tooltip_html = ''
            if css_class: css_class_html = ' class="%s"' % css_class
            else: css_class_html = ''
            if onclick:
                if targets: targets_html = ' targets="%s"' % targets
                else: targets_html = ''
                s += ('<tt id="%s"%s%s><a%s%s href="#" onclick="%s">' %
                      (uid, css_class_html, targets_html, tooltip_html,
                       css_class_html, onclick))
            elif url:
                if isinstance(url, unicode):
                    url = url.encode('ascii', 'xmlcharrefreplace')
                s += ('<a%s%s href="%s">' %
                      (tooltip_html, css_class_html, url))
            elif css_class_html or tooltip_html:
                s += '<tt%s%s>' % (tooltip_html, css_class_html)
            if i == len(line)-1:
                s += ' </tt>' # Closes <tt class="py-line">
                s += cgi.escape(toktext)
            else:
                try:
                    s += self.add_line_numbers(cgi.escape(toktext), css_class)
                except Exception, e:
                    print (toktext, css_class, toktext.encode('ascii'))
                    raise

            if onclick: s += "</a></tt>"
            elif url: s += '</a>'
            elif css_class_html or tooltip_html: s += '</tt>'

        if self.ADD_DEF_BLOCKS:
            for i in range(ended_def_blocks):
                self.out(self.END_DEF_BLOCK)

        # Strip any empty <tt>s.
        s = re.sub(r'<tt class="[\w+]"></tt>', '', s)

        # Write the line.
        self.out(s)

        if def_name and starting_def_block:
            self.out('</div>')

        # Add div's if we're starting a def block.
        if (self.ADD_DEF_BLOCKS and def_name and starting_def_block and
            (line[-2][1] == ':')):
            indentation = (''.join(self.indents)+'    ').replace(' ', '+')
            linenum_padding = '+'*self.linenum_size
            name=self.context_name(def_name)
            self.out(self.START_DEF_BLOCK % (name, linenum_padding,
                                             indentation, name))
            
        self.def_name = def_name
        self.def_type = def_type
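
The substitution a few lines above (re.sub(r'<tt class="[\w+]"></tt>', '', s)) exists only to drop <tt> wrappers that ended up empty after colorization. A minimal standalone sketch of the same idea, with an illustrative input and a slightly widened character class:

import re

html = '<tt class="py-op">(</tt><tt class="py-name"></tt><tt class="py-op">)</tt>'
# Remove <tt> elements that carry a CSS class but no text content.
cleaned = re.sub(r'<tt class="[\w-]+"></tt>', '', html)
print(cleaned)  # <tt class="py-op">(</tt><tt class="py-op">)</tt>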

Example 76

Project: brython Source File: markdown.py
Function: mark
def mark(src):

    global refs
    t0 = time.time()
    refs = {}
    # split source in sections
    # sections can be :
    # - a block-level HTML element (markdown syntax will not be processed)
    # - a script
    # - a span-level HTML tag (markdown syntax will be processed)
    # - a code block
    
    # normalise line feeds
    src = src.replace('\r\n','\n')
    
    # lines followed by dashes
    src = re.sub(r'(.*?)\n=+\n', '\n# \\1\n', src)
    src = re.sub(r'(.*?)\n-+\n', '\n## \\1\n', src) 

    lines = src.split('\n')+['']
    
    i = bq = 0
    ul = ol = 0
    
    while i<len(lines):

        # enclose lines starting with > in a blockquote
        if lines[i].startswith('>'):
            nb = 1
            while nb<len(lines[i]) and lines[i][nb]=='>':
                nb += 1
            lines[i] = lines[i][nb:]
            if nb>bq:
                lines.insert(i,'<blockquote>'*(nb-bq))
                i += 1
                bq = nb
            elif nb<bq:
                lines.insert(i,'</blockquote>'*(bq-nb))
                i += 1
                bq = nb
        elif bq>0:
            lines.insert(i,'</blockquote>'*bq)
            i += 1
            bq = 0

        # unordered lists
        if lines[i].strip() and lines[i].lstrip()[0] in '-+*' \
            and len(lines[i].lstrip())>1 \
            and lines[i].lstrip()[1]==' ' \
            and (i==0 or ul or not lines[i-1].strip()):
            # line indentation indicates nesting level
            nb = 1+len(lines[i])-len(lines[i].lstrip())
            lines[i] = '<li>'+lines[i][nb:]
            if nb>ul:
                lines.insert(i,'<ul>'*(nb-ul))
                i += 1
            elif nb<ul:
                lines.insert(i,'</ul>'*(ul-nb))
                i += 1
            ul = nb
        elif ul and not lines[i].strip():
            if i<len(lines)-1 and lines[i+1].strip() \
                and not lines[i+1].startswith(' '):
                    nline = lines[i+1].lstrip()
                    if nline[0] in '-+*' and len(nline)>1 and nline[1]==' ':
                        pass
                    else:
                        lines.insert(i,'</ul>'*ul)
                        i += 1
                        ul = 0

        # ordered lists
        mo = re.search(r'^(\d+\.)',lines[i])
        if mo:
            if not ol:
                lines.insert(i,'<ol>')
                i += 1
            lines[i] = '<li>'+lines[i][len(mo.groups()[0]):]
            ol = 1
        elif ol and not lines[i].strip() and i<len(lines)-1 \
            and not lines[i+1].startswith(' ') \
            and not re.search(r'^(\d+\.)',lines[i+1]):
            lines.insert(i,'</ol>')
            i += 1
            ol = 0
        
        i += 1
    
    if ul:
        lines.append('</ul>'*ul)
    if ol:
        lines.append('</ol>'*ol)
    if bq:
        lines.append('</blockquote>'*bq)

    t1 = time.time()
    #print('part 1', t1-t0)    
    sections = []
    scripts = []
    section = Marked()

    i = 0
    while i<len(lines):
        line = lines[i]
        if line.strip() and line.startswith('    '):
            if isinstance(section,Marked) and section.line:
                sections.append(section)
            section = CodeBlock(line[4:])
            j = i+1
            while j<len(lines) and lines[j].startswith('    '):
                section.lines.append(lines[j][4:])
                j += 1
            sections.append(section)
            section = Marked()
            i = j   
            continue

        elif line.strip() and line.startswith("```"):
            # fenced code blocks à la Github Flavoured Markdown
            if isinstance(section,Marked) and section.line:
                sections.append(section)
            section = CodeBlock(line)
            j = i+1
            while j<len(lines) and not lines[j].startswith("```"):
                section.lines.append(lines[j])
                j += 1
            sections.append(section)
            section = Marked()
            i = j+1
            continue

        elif line.lower().startswith('<script'):
            if isinstance(section,Marked) and section.line:
                sections.append(section)
                section = Marked()
            j = i+1
            while j<len(lines):
                if lines[j].lower().startswith('</script>'):
                    scripts.append('\n'.join(lines[i+1:j]))
                    for k in range(i,j+1):
                        lines[k] = ''
                    break
                j += 1
            i = j
            continue

        # ATX header ('#'-prefixed)
        elif line.startswith('#'):
            level = 1
            line = lines[i]
            while level<len(line) and line[level]=='#' and level<=6:
                level += 1
            if not line[level+1:].strip():
                if level==1:
                    i += 1
                    continue
                else:
                    lines[i] = '<H%s>%s</H%s>\n' %(level-1,'#',level-1)
            else:
                lines[i] = '<H%s>%s</H%s>\n' %(level,line[level+1:],level)

        else:
            mo = re.search(ref_pattern,line)
            if mo is not None:
                if isinstance(section,Marked) and section.line:
                    sections.append(section)
                    section = Marked()
                key = mo.groups()[0]
                value = URL(mo.groups()[1])
                refs[key.lower()] = value
            else:
                if not line.strip():
                    line = '<p></p>'
                if section.line:
                    section.line += '\n'
                section.line += line
                    
            i += 1
    t2 = time.time()
    #print('section 2', t2-t1)
    if isinstance(section,Marked) and section.line:
        sections.append(section)

    res = ''
    for section in sections:
        mk,_scripts = section.to_html()
        res += mk
        scripts += _scripts
    #print('end mark', time.time()-t2)
    return res,scripts
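
The two substitutions at the top of mark() rewrite setext-style headers (a title underlined with '=' or '-') as '#'-prefixed headers before the line-by-line pass. In isolation, on a made-up snippet:

import re

src = "Title\n=====\nIntro text\nSection\n-------\nBody\n"
src = re.sub(r'(.*?)\n=+\n', '\n# \\1\n', src)   # '=' underline -> level-1 header
src = re.sub(r'(.*?)\n-+\n', '\n## \\1\n', src)  # '-' underline -> level-2 header
print(src)
# prints (after a leading blank line):
# # Title
# Intro text
# ## Section
# Body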

Example 77

Project: ZenPacks.zenoss.OpenStackInfrastructure Source File: utils.py
def create_model_data(dmd):
    '''
    Return an Endpoint suitable for Impact functional testing.
    '''
    # DeviceClass
    dc = dmd.Devices.createOrganizer('/OpenStack/Infrastructure')
    dc.setZenProperty('zPythonClass', 'ZenPacks.zenoss.OpenStackInfrastructure.Endpoint')

    # OSProcessClasses
    osc = dmd.Processes.createOrganizer('/OpenStack')
    for binary in ['nova-cert', 'nova-conductor', 'nova-consoleauth', 'nova-scheduler', 'nova-compute', 'nova-api']:
        osc.manage_addOSProcessClass(binary)

    # Endpoint
    endpoint = dc.createInstance('endpoint')

    # Org Structure
    from ZenPacks.zenoss.OpenStackInfrastructure.Region import Region
    from ZenPacks.zenoss.OpenStackInfrastructure.AvailabilityZone import AvailabilityZone
    region = addContained(endpoint, "components", Region("region"))
    zone1 = addContained(endpoint, "components", AvailabilityZone("zone1"))
    zone2 = addContained(endpoint, "components", AvailabilityZone("zone2"))
    addNonContained(region, "childOrgs", zone1)
    addNonContained(region, "childOrgs", zone2)

    # Tenants
    from ZenPacks.zenoss.OpenStackInfrastructure.Tenant import Tenant
    tenant1 = addContained(endpoint, "components", Tenant("tenant-tenant1"))
    tenant2 = addContained(endpoint, "components", Tenant("tenant-tenant2"))

    # Flavor
    from ZenPacks.zenoss.OpenStackInfrastructure.Flavor import Flavor
    flavor1 = addContained(endpoint, "components", Flavor("flavor1"))

    # Image
    from ZenPacks.zenoss.OpenStackInfrastructure.Image import Image
    image1 = addContained(endpoint, "components", Image("image1"))

    # Host
    from ZenPacks.zenoss.OpenStackInfrastructure.Host import Host
    computehost1 = addContained(endpoint, "components", Host("computehost1"))
    addNonContained(computehost1, "orgComponent", zone1)
    computehost2 = addContained(endpoint, "components", Host("computehost2"))
    addNonContained(computehost2, "orgComponent", zone2)
    controllerhost = addContained(endpoint, "components", Host("controllerhost"))
    addNonContained(controllerhost, "orgComponent", zone1)

    # SoftwareComponents
    from ZenPacks.zenoss.OpenStackInfrastructure.NovaService import NovaService
    from ZenPacks.zenoss.OpenStackInfrastructure.NovaApi import NovaApi
    nova_consoleauth = addContained(endpoint, "components", NovaService("nova-consoleauth"))
    nova_consoleauth.binary = 'nova-consoleauth'
    addNonContained(nova_consoleauth, "hostedOn", controllerhost)
    addNonContained(nova_consoleauth, "orgComponent", zone1)
    nova_scheduler = addContained(endpoint, "components", NovaService("nova-scheduler"))
    nova_scheduler.binary = 'nova-scheduler'
    addNonContained(nova_scheduler, "hostedOn", controllerhost)
    addNonContained(nova_scheduler, "orgComponent", zone1)
    nova_conductor1 = addContained(endpoint, "components", NovaService("nova-conductor1"))
    nova_conductor1.binary = 'nova-conductor'
    nova_conductor2 = addContained(endpoint, "components", NovaService("nova-conductor2"))
    nova_conductor2.binary = 'nova-conductor'
    addNonContained(nova_conductor1, "hostedOn", computehost1)
    addNonContained(nova_conductor1, "orgComponent", zone1)
    addNonContained(nova_conductor2, "hostedOn", computehost2)
    addNonContained(nova_conductor2, "orgComponent", zone2)
    nova_compute1 = addContained(endpoint, "components", NovaService("nova-compute1"))
    nova_compute1.binary = 'nova-compute'
    nova_compute2 = addContained(endpoint, "components", NovaService("nova-compute2"))
    nova_compute2.binary = 'nova-compute'
    addNonContained(nova_compute1, "hostedOn", computehost1)
    addNonContained(nova_compute1, "orgComponent", zone1)
    addNonContained(nova_compute2, "hostedOn", computehost2)
    addNonContained(nova_compute2, "orgComponent", zone2)
    nova_cert = addContained(endpoint, "components", NovaService("nova-cert"))
    nova_cert.binary = 'nova-cert'
    addNonContained(nova_cert, "hostedOn", controllerhost)
    addNonContained(nova_cert, "orgComponent", zone1)
    nova_api = addContained(endpoint, "components", NovaApi("nova-api"))
    nova_api.binary = 'nova-api'
    addNonContained(nova_api, "hostedOn", controllerhost)
    addNonContained(nova_api, "orgComponent", region)

    # Hypervisor
    from ZenPacks.zenoss.OpenStackInfrastructure.Hypervisor import Hypervisor
    hypervisor1 = addContained(endpoint, "components", Hypervisor("hypervisor1"))
    hypervisor2 = addContained(endpoint, "components", Hypervisor("hypervisor2"))
    addNonContained(hypervisor1, "host", computehost1)
    addNonContained(hypervisor2, "host", computehost2)

    # Instance
    from ZenPacks.zenoss.OpenStackInfrastructure.Instance import Instance
    instance1 = addContained(endpoint, "components", Instance("instance1"))
    instance2 = addContained(endpoint, "components", Instance("instance2"))
    instance3 = addContained(endpoint, "components", Instance("instance3"))
    instance4 = addContained(endpoint, "components", Instance("instance4"))
    addNonContained(instance1, "flavor", flavor1)
    addNonContained(instance2, "flavor", flavor1)
    addNonContained(instance3, "flavor", flavor1)
    addNonContained(instance4, "flavor", flavor1)
    addNonContained(instance1, "image", image1)
    addNonContained(instance2, "image", image1)
    addNonContained(instance3, "image", image1)
    addNonContained(instance4, "image", image1)
    addNonContained(instance1, "hypervisor", hypervisor1)
    addNonContained(instance2, "hypervisor", hypervisor1)
    addNonContained(instance3, "hypervisor", hypervisor2)
    addNonContained(instance4, "hypervisor", hypervisor2)
    addNonContained(instance1, "tenant", tenant1)
    addNonContained(instance2, "tenant", tenant2)
    addNonContained(instance3, "tenant", tenant1)
    addNonContained(instance4, "tenant", tenant2)

    # Vnic
    from ZenPacks.zenoss.OpenStackInfrastructure.Vnic import Vnic
    instance1vnic1 = addContained(instance1, "vnics", Vnic("instance1_vnic1"))
    instance1vnic1.macaddress = 'de:ad:be:ef:01:01'
    instance1vnic1.index_object()
    instance1vnic2 = addContained(instance1, "vnics", Vnic("instance1_vnic2"))
    instance1vnic2.macaddress = 'de:ad:be:ef:01:02'
    instance1vnic2.index_object()
    instance2vnic1 = addContained(instance2, "vnics", Vnic("instance2_vnic1"))
    instance2vnic1.macaddress = 'de:ad:be:ef:02:01'
    instance2vnic1.index_object()
    instance2vnic2 = addContained(instance2, "vnics", Vnic("instance2_vnic2"))
    instance2vnic2.macaddress = 'de:ad:be:ef:02:02'
    instance2vnic2.index_object()
    instance3vnic1 = addContained(instance3, "vnics", Vnic("instance3_vnic1"))
    instance3vnic1.macaddress = 'de:ad:be:ef:03:01'
    instance3vnic1.index_object()
    instance3vnic2 = addContained(instance3, "vnics", Vnic("instance3_vnic2"))
    instance3vnic2.macaddress = 'de:ad:be:ef:03:02'
    instance3vnic2.index_object()
    instance4vnic1 = addContained(instance4, "vnics", Vnic("instance4_vnic1"))
    instance4vnic1.macaddress = 'de:ad:be:ef:04:01'
    instance4vnic1.index_object()
    instance4vnic2 = addContained(instance4, "vnics", Vnic("instance4_vnic2"))
    instance4vnic2.macaddress = 'de:ad:be:ef:04:02'
    instance4vnic2.index_object()

    # Linux guest devices (Virtual)
    # make sure that the interfaces line up.
    guest_dc = dmd.Devices.createOrganizer('/Server/SSH/Linux')
    guest_dc.setZenProperty('zPythonClass', 'Products.ZenModel.Device')
    guest_instance1 = guest_dc.createInstance("g-instance1")
    guest_instance2 = guest_dc.createInstance("g-instance2")
    guest_instance3 = guest_dc.createInstance("g-instance3")
    # instance4 is not monitored by zenoss.

    from Products.ZenModel.IpInterface import IpInterface

    def add_linux_interface_mac(device, interface_name, macaddress):
        eth_if = IpInterface(interface_name)
        device.os.interfaces._setObject(eth_if.id, eth_if)
        eth_if = device.os.interfaces._getOb(eth_if.id)
        eth_if.macaddress = macaddress
        eth_if.index_object()
        device.index_object()

    add_linux_interface_mac(guest_instance1, 'eth0', 'de:ad:be:ef:01:01')
    add_linux_interface_mac(guest_instance1, 'eth1', 'de:ad:be:ef:01:02')
    add_linux_interface_mac(guest_instance2, 'eth0', 'de:ad:be:ef:02:01')
    add_linux_interface_mac(guest_instance2, 'eth1', 'de:ad:be:ef:02:02')
    add_linux_interface_mac(guest_instance3, 'eth0', 'de:ad:be:ef:03:01')
    add_linux_interface_mac(guest_instance3, 'eth1', 'de:ad:be:ef:03:02')

    # Linux devices (Physical)
    # (link to host1 and host2)
    phys_dc = dmd.Devices.createOrganizer('/Server/SSH/Linux/NovaHost')
    phys_dc.setZenProperty('zPythonClass', 'Products.ZenModel.Device')
    phys_computehost1 = phys_dc.createInstance("p-computehost1")
    phys_computehost2 = phys_dc.createInstance("p-computehost2")
    phys_controllerhost = phys_dc.createInstance("p-controllerhost")

    # Link the host components to the physical hosts.
    computehost1.claim_proxy_device(phys_computehost1)
    computehost2.claim_proxy_device(phys_computehost2)
    controllerhost.claim_proxy_device(phys_controllerhost)

    # Add OSprocesses for each of the software components.
    from ZenPacks.zenoss.OpenStackInfrastructure.SoftwareComponent import SoftwareComponent
    from Products.ZenModel.OSProcess import OSProcess
    for component in endpoint.components():
        if isinstance(component, SoftwareComponent):
            binary = component.binary
            linux_device = component.hostedOn().proxy_device()

            process_id = '%s_%s' % (linux_device.id, binary)
            process = OSProcess(process_id)
            linux_device.os.processes._setObject(process_id, process)
            process = linux_device.os.processes._getOb(process_id)

            process_class = re.sub(r'\d+$', '', binary)
            process.setOSProcessClass("Processes/OpenStack/osProcessClasses/%s" % process_class)


    # Cinder
    from ZenPacks.zenoss.OpenStackInfrastructure.Volume import Volume
    from ZenPacks.zenoss.OpenStackInfrastructure.VolSnapshot import VolSnapshot
    volume1 = addContained(endpoint, "components", Volume("volume1"))
    volsnap1 = addContained(endpoint, "components", VolSnapshot("volsnap1"))
    addNonContained(instance1, "volumes", volume1)
    addNonContained(volume1, "volSnapshots", volsnap1)

    return {
        'endpoint': endpoint,
        'phys_dc': phys_dc,
        'guest_dc': guest_dc
    }
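
The one re.sub call in this fixture sits in the OSProcess loop near the end: process_class = re.sub(r'\d+$', '', binary) strips a trailing run of digits, if any, from the binary name before it is used to build the OSProcessClass path. Reduced to just that call, with illustrative inputs:

import re

for binary in ("nova-compute1", "nova-conductor2", "nova-api"):
    # Drop a numeric suffix, if present, to recover the process class name.
    print(re.sub(r'\d+$', '', binary))
# nova-compute
# nova-conductor
# nova-api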

Example 78

Project: pelisalacarta Source File: zentorrents.py
def fanart(item):
    logger.info("pelisalacarta.peliculasdk fanart")
    itemlist = []
    url = item.url
    data = scrapertools.cachePage(url)
    data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
    if "peliculas" in item.url:
    
        if "microhd" in url or "web" in url or "1080" in url or "bluray" in url or  "HDRip" in item.title:
            title= scrapertools.get_match(data,'<title>([^"]+) \[')
            title= re.sub(r"3D|[0-9]|SBS|\(.*?\)|\[.*?\]|","",title)
            title=title.replace('Perdón','perdon')
            title= title.replace(' ','%20')
        
    
        else:
                
            title= scrapertools.get_match(data,'<title>([^"]+) -')
            title= re.sub(r"3D|[0-9]|SBS|\(.*?\)|\[.*?\]|","",title)
            title= title.replace('á','a')
            title= title.replace('Á','A')
            title= title.replace('é','e')
            title= title.replace('í','i')
            title= title.replace('ó','o')
            title= title.replace('ú','u')
            title= title.replace('ñ','n')
            title= title.replace(' ','%20')

        url="http://api.themoviedb.org/3/search/movie?api_key=2e2160006592024ba87ccdf78c28f49f&query=" + title + "&language=es&include_adult=false"
        data = scrapertools.cachePage(url)
        data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
        patron = '"page":1.*?,"id":(.*?),.*?"backdrop_path":"\\\(.*?)"'
        matches = re.compile(patron,re.DOTALL).findall(data)
        if len(matches)==0:
            extra=item.thumbnail
            show= item.thumbnail
            posterdb = item.thumbnail
            fanart_info = item.thumbnail
            fanart_trailer = item.thumbnail
            category= item.thumbnail
            itemlist.append( Item(channel=item.channel, title=item.title, url=item.url, action="findvideos", thumbnail=item.thumbnail, fanart=item.thumbnail,extra = extra, show= show, category= category,folder=True) )
        
        for id, fan in matches:
            try:
                posterdb = scrapertools.get_match(data,'"page":1,.*?"poster_path":"\\\(.*?)"')
                posterdb =  "https://image.tmdb.org/t/p/original" + posterdb
            except:
                posterdb = item.thumbnail
            fanart="https://image.tmdb.org/t/p/original" + fan
            item.extra= fanart
            url ="http://api.themoviedb.org/3/movie/"+id+"/images?api_key=2e2160006592024ba87ccdf78c28f49f"
            data = scrapertools.cachePage(url)
            data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
            
            patron = '"backdrops".*?"file_path":".*?",.*?"file_path":"(.*?)",.*?"file_path":"(.*?)",.*?"file_path":"(.*?)"'
            matches = re.compile(patron,re.DOTALL).findall(data)
                    
            if len(matches) == 0:
                patron = '"backdrops".*?"file_path":"(.*?)",.*?"file_path":"(.*?)",.*?"file_path":"(.*?)"'
                matches = re.compile(patron,re.DOTALL).findall(data)
                if len(matches) == 0:
                    fanart_info = item.extra
                    fanart_trailer = item.extra
                    fanart_2 = item.extra
            for fanart_info, fanart_trailer, fanart_2 in matches:
                fanart_info = "https://image.tmdb.org/t/p/original" + fanart_info
                fanart_trailer = "https://image.tmdb.org/t/p/original" + fanart_trailer
                fanart_2 = "https://image.tmdb.org/t/p/original" + fanart_2
                        
            #clearart, fanart_2 y logo
            url ="http://webservice.fanart.tv/v3/movies/"+id+"?api_key=dffe90fba4d02c199ae7a9e71330c987"
            data = scrapertools.cachePage(url)
            data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
            patron = '"hdmovielogo":.*?"url": "([^"]+)"'
            matches = re.compile(patron,re.DOTALL).findall(data)
            if '"moviedisc"' in data:
                disc = scrapertools.get_match(data,'"moviedisc":.*?"url": "([^"]+)"')
            if '"movieposter"' in data:
                poster = scrapertools.get_match(data,'"movieposter":.*?"url": "([^"]+)"')
            if '"moviethumb"' in data:
                thumb = scrapertools.get_match(data,'"moviethumb":.*?"url": "([^"]+)"')
            if '"moviebanner"' in data:
                banner= scrapertools.get_match(data,'"moviebanner":.*?"url": "([^"]+)"')
            
            if len(matches)==0:
                extra=  posterdb
                show = fanart_2
                category = item.extra
                itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=item.thumbnail, fanart=item.extra, extra=extra, show=show, category= category, folder=True) )
        for logo in matches:
             if '"hdmovieclearart"' in data:
                  clear=scrapertools.get_match(data,'"hdmovieclearart":.*?"url": "([^"]+)"')
                  if '"moviebackground"' in data:
                      
                      extra=clear
                      show= fanart_2
                      if '"moviedisc"' in data:
                           category= disc
                      else:
                           category= clear
                      itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=logo, fanart=item.extra, extra=extra,show=show,  category= category,folder=True) )
                  else:
                        extra= clear
                        show=fanart_2
                        if '"moviedisc"' in data:
                             category= disc
                        else:
                            category= clear
                        itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=logo, fanart=item.extra, extra=extra,show=show,  category= category, folder=True) )
                    
             if '"moviebackground"' in data:
                 
                  if '"hdmovieclearart"' in data:
                       clear=scrapertools.get_match(data,'"hdmovieclearart":.*?"url": "([^"]+)"')
                       extra=clear
                       show= fanart_2
                       if '"moviedisc"' in data:
                            category= disc
                       else:
                            category= clear
                  else:
                        extra=logo
                        show= fanart_2
                        if '"moviedisc"' in data:
                            category= disc
                        else:
                            category= logo
                        itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=logo, fanart=item.extra, extra=extra,show=show,  category= category, folder=True) )
                    
                    
                    
                    
             if not '"hdmovieclearart"' in data and not '"moviebackground"' in data:
                      extra= logo
                      show=  fanart_2
                      if '"moviedisc"' in data:
                           category= disc
                      else:
                           category= item.extra
                      itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=logo, fanart=item.extra, extra=extra,show=show ,  category= category, folder=True) )
       
    else:
        if "series" in item.url:
            if "hdtv" in item.url or "720" in item.title or "1080p" in item.title:
                title= scrapertools.get_match(data,'<title>([^"]+) \[')
                title= re.sub(r"3D|'|,|[0-9]|#|;|\[.*?\]|SBS|-|","",title)
                title= title.replace('Temporada','')
                title= title.replace('Fin','')
                title= title.replace('x','')
                title= title.replace('Heli','Helix')
                title= title.replace('Anatomía','Anatomia')
                title= title.replace('á','a')
                title= title.replace('Á','A')
                title= title.replace('é','e')
                title= title.replace('í','i')
                title= title.replace('ó','o')
                title= title.replace('ú','u')
                title= title.replace('ñ','n')
                title= title.replace(' ','%20')
            
            
            else:
                title= scrapertools.get_match(data,'<title>([^"]+) -')
                title= re.sub(r"3D|'|,|[0-9]|#|;|´|VOSE|\[.*?\]|-|","",title)
                title= title.replace('Temporada','')
                title= title.replace('Fin','')
                title= title.replace('x','')
                title= title.replace('á','a')
                title= title.replace('Á','A')
                title= title.replace('é','e')
                title= title.replace('í','i')
                title= title.replace('ó','o')
                title= title.replace('ú','u')
                title= title.replace('ñ','n')
                title= title.replace('Anatomía','Anatomia')
                title= title.replace(' ','%20')
                
        url="http://thetvdb.com/api/GetSeries.php?seriesname=" + title + "&language=es"
        if "Erase%20una%20vez%20%20" in title:
            url ="http://thetvdb.com/api/GetSeries.php?seriesname=Erase%20una%20vez%20(2011)&language=es"
        data = scrapertools.cachePage(url)
        data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
        patron = '<Data><Series><seriesid>([^<]+)</seriesid>'
        matches = re.compile(patron,re.DOTALL).findall(data)
        if len(matches)==0:
            extra= item.thumbnail
            show=  item.thumbnail
            fanart_info = item.thumbnail
            fanart_trailer = item.thumbnail
            category= ""
            itemlist.append( Item(channel=item.channel, title=item.title, url=item.url, action="findvideos", thumbnail=item.thumbnail, fanart=item.thumbnail ,extra=extra, category= category,  show=show , folder=True) )
        else:
            for id in matches:
                category = id
                id_serie = id
                url ="http://thetvdb.com/api/1D62F2F90030C444/series/"+id_serie+"/banners.xml"
                data = scrapertools.cachePage(url)
                data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
                patron = '<Banners><Banner>.*?<VignettePath>(.*?)</VignettePath>'
                matches = re.compile(patron,re.DOTALL).findall(data)
                if len(matches)==0:
                    extra=item.thumbnail
                    show= item.thumbnail
                    fanart_info = item.thumbnail
                    fanart_trailer = item.thumbnail
                    itemlist.append( Item(channel=item.channel, title=item.title, url=item.url, action="findvideos", thumbnail=item.thumbnail, fanart=item.thumbnail ,category = category, extra=extra, show=show, folder=True) )
            for fan in matches:
                fanart="http://thetvdb.com/banners/" + fan
                item.extra= fanart
                patron= '<Banners><Banner>.*?<BannerPath>.*?</BannerPath>.*?</Banner><Banner>.*?<BannerPath>(.*?)</BannerPath>.*?</Banner><Banner>.*?<BannerPath>(.*?)</BannerPath>.*?</Banner><Banner>.*?<BannerPath>(.*?)</BannerPath>'
                matches = re.compile(patron,re.DOTALL).findall(data)
                if len(matches)==0:
                    fanart_info= item.extra
                    fanart_trailer = item.extra
                    fanart_2 = item.extra
                for fanart_info, fanart_trailer, fanart_2 in matches:
                    fanart_info = "http://thetvdb.com/banners/" + fanart_info
                    fanart_trailer = "http://thetvdb.com/banners/" + fanart_trailer
                    fanart_2 = "http://thetvdb.com/banners/" + fanart_2
            #clearart, fanart_2 y logo
            for id in matches:
                url ="http://webservice.fanart.tv/v3/tv/"+id_serie+"?api_key=dffe90fba4d02c199ae7a9e71330c987"
                if "Castle" in title:
                    url ="http://webservice.fanart.tv/v3/tv/83462?api_key=dffe90fba4d02c199ae7a9e71330c987"
                data = scrapertools.cachePage(url)
                data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
                patron = '"clearlogo":.*?"url": "([^"]+)"'
                matches = re.compile(patron,re.DOTALL).findall(data)
                if '"tvposter"' in data:
                    tvposter = scrapertools.get_match(data,'"tvposter":.*?"url": "([^"]+)"')
                if '"tvbanner"' in data:
                    tvbanner = scrapertools.get_match(data,'"tvbanner":.*?"url": "([^"]+)"')
                if '"tvthumb"' in data:
                    tvthumb = scrapertools.get_match(data,'"tvthumb":.*?"url": "([^"]+)"')
                if '"hdtvlogo"' in data:
                    hdtvlogo = scrapertools.get_match(data,'"hdtvlogo":.*?"url": "([^"]+)"')
                if '"hdclearart"' in data:
                    hdtvclear = scrapertools.get_match(data,'"hdclearart":.*?"url": "([^"]+)"')
                if len(matches)==0:
                    if '"hdtvlogo"' in data:
                        if "showbackground" in data:
                            
                            if '"hdclearart"' in data:
                                thumbnail = hdtvlogo
                                extra=  hdtvclear
                                show = fanart_2
                            else:
                                thumbnail = hdtvlogo
                                extra= thumbnail
                                show = fanart_2
                            itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=thumbnail, fanart=item.extra, category=category, extra=extra, show=show, folder=True) )
                                                                        
                                                                        
                        else:
                            if '"hdclearart"' in data:
                                thumbnail= hdtvlogo
                                extra= hdtvclear
                                show= fanart_2
                            else:
                                thumbnail= hdtvlogo
                                extra= thumbnail
                                show= fanart_2
                            itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=thumbnail, fanart=item.extra, extra=extra, show=show,  category= category, folder=True) )
                    else:
                         extra=  item.thumbnail
                         show = fanart_2
                         itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url,  server="torrent", thumbnail=item.thumbnail, fanart=item.extra, extra=extra, show=show, category = category, folder=True) )
                                                                                                                                
            for logo in matches:
                if '"hdtvlogo"' in data:
                    thumbnail = hdtvlogo
                elif not '"hdtvlogo"' in data :
                           if '"clearlogo"' in data:
                            thumbnail= logo
                else:
                     thumbnail= item.thumbnail
                if '"clearart"' in data:
                    clear=scrapertools.get_match(data,'"clearart":.*?"url": "([^"]+)"')
                    if "showbackground" in data:
                                
                        extra=clear
                        show= fanart_2
                        itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=thumbnail, fanart=item.extra, extra=extra,show=show, category= category,  folder=True) )
                    else:
                         extra= clear
                         show=fanart_2
                         itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=thumbnail, fanart=item.extra, extra=extra,show=show, category= category, folder=True) )
                                     
                if "showbackground" in data:
                            
                    if '"clearart"' in data:
                        clear=scrapertools.get_match(data,'"clearart":.*?"url": "([^"]+)"')
                        extra=clear
                        show= fanart_2
                    else:
                        extra=logo
                        show= fanart_2
                        itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=thumbnail, fanart=item.extra, extra=extra,show=show,  category = category, folder=True) )
                                     
                if not '"clearart"' in data and not '"showbackground"' in data:
                        if '"hdclearart"' in data:
                            extra= hdtvclear
                            show= fanart_2
                        else:
                            extra= thumbnail
                            show=  fanart_2
                        itemlist.append( Item(channel=item.channel, title = item.title , action="findvideos", url=item.url, server="torrent", thumbnail=thumbnail, fanart=item.extra, extra=extra,show=show , category = category, folder=True) )
    

    title ="Info"
    title = title.replace(title,bbcode_kodi2html("[COLOR skyblue]"+title+"[/COLOR]"))
    if not "series" in item.url:
       thumbnail = posterdb
    if "series" in item.url:
        if '"tvposter"' in data:
            thumbnail= tvposter
        else:
            thumbnail = item.thumbnail
        
        if "tvbanner" in data:
            category = tvbanner
        else:
            
            category = show


    itemlist.append( Item(channel=item.channel, action="info" , title=title , url=item.url, thumbnail=thumbnail, fanart=fanart_info, extra= extra, category = category, show= show, folder=False ))

    title= "[COLOR cadetblue]Trailer[/COLOR]"
    if len(item.extra)==0:
        fanart=item.thumbnail
    else:
        fanart = item.extra
    if "series" in item.url:
        if '"tvthumb"' in data:
            thumbnail = tvthumb
        else:
            thumbnail = item.thumbnail
        if '"tvbanner"' in data:
            extra= tvbanner
        elif '"tvthumb"' in data:
            extra = tvthumb
        else:
            extra = item.thumbnail
    else:
        if '"moviethumb"' in data:
            thumbnail = thumb
        else:
            thumbnail = posterdb
        
        if '"moviebanner"' in data:
            extra= banner
        else:
            if '"hdmovieclearart"' in data:
                extra = clear
            
            else:
                extra = posterdb


    itemlist.append( Item(channel=item.channel, action="trailer", title=title , url=item.url , thumbnail=thumbnail , plot=item.plot , fanart=fanart_trailer, extra=extra, folder=True) )
    return itemlist
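
Nearly every request in this scraper is followed by the same cleanup call, re.sub(r"\n|\r|\t|\s{2}|&nbsp;", "", data), which flattens the fetched HTML before the string searches and regex extractions. A small sketch of that step on a made-up fragment:

import re

html = "<title>Movie Title\t(2014)\r\n\t\t[1080p]&nbsp;</title>"
# Strip newlines, tabs, runs of two whitespace characters and &nbsp; entities.
flat = re.sub(r"\n|\r|\t|\s{2}|&nbsp;", "", html)
print(flat)  # <title>Movie Title(2014)[1080p]</title>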

Example 79

Project: canvas Source File: rjsmin.py
Function: make_jsmin
def _make_jsmin(extended=True, python_only=False):
    """
    Generate JS minifier based on `jsmin.c by Douglas Crockford`_

    .. _jsmin.c by Douglas Crockford:
       http://www.crockford.com/javascript/jsmin.c

    :Parameters:
      `extended` : ``bool``
        Extended Regexps? (using lookahead and lookbehind). This is faster,
        because it can be optimized way more. The regexps used with `extended`
        being false are only left here to allow easier porting to platforms
        without extended regex features (and for my own reference...)

      `python_only` : ``bool``
        Use only the python variant. If true, no attempt is made to load
        the c extension.

    :Return: Minifier
    :Rtype: ``callable``
    """
    # pylint: disable = R0912, R0914, W0612
    if not python_only:
        try:
            import _rjsmin
        except ImportError:
            pass
        else:
            return _rjsmin.jsmin
    try:
        xrange
    except NameError:
        xrange = range # pylint: disable = W0622

    space_chars = r'[\000-\011\013\014\016-\040]'

    line_comment = r'(?://[^\r\n]*)'
    space_comment = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
    string1 = \
        r'(?:\047[^\047\\\r\n]*(?:\\(?:[^\r\n]|\r?\n|\r)[^\047\\\r\n]*)*\047)'
    string2 = r'(?:"[^"\\\r\n]*(?:\\(?:[^\r\n]|\r?\n|\r)[^"\\\r\n]*)*")'
    strings = r'(?:%s|%s)' % (string1, string2)

    charclass = r'(?:\[[^\\\]\r\n]*(?:\\[^\r\n][^\\\]\r\n]*)*\])'
    nospecial = r'[^/\\\[\r\n]'
    if extended:
        regex = r'(?:/(?![\r\n/*])%s*(?:(?:\\[^\r\n]|%s)%s*)*/)' % (
            nospecial, charclass, nospecial
        )
    else:
        regex = (
            r'(?:/(?:[^*/\\\r\n\[]|%s|\\[^\r\n])%s*(?:(?:\\[^\r\n]|%s)%s*)*/)'
        )
        regex = regex % (charclass, nospecial, charclass, nospecial)
    pre_regex = r'[(,=:\[!&|?{};\r\n]'

    space = r'(?:%s|%s)' % (space_chars, space_comment)
    newline = r'(?:%s?[\r\n])' % line_comment

    def fix_charclass(result):
        """ Fixup string of chars to fit into a regex char class """
        pos = result.find('-')
        if pos >= 0:
            result = r'%s%s-' % (result[:pos], result[pos + 1:])

        def sequentize(string):
            """
            Notate consecutive characters as sequence

            (1-4 instead of 1234)
            """
            first, last, result = None, None, []
            for char in map(ord, string):
                if last is None:
                    first = last = char
                elif last + 1 == char:
                    last = char
                else:
                    result.append((first, last))
                    first = last = char
            if last is not None:
                result.append((first, last))
            return ''.join(['%s%s%s' % (
                chr(first),
                last > first + 1 and '-' or '',
                last != first and chr(last) or ''
            ) for first, last in result])

        return _re.sub(r'([\000-\040\047])', # for better portability
            lambda m: '\\%03o' % ord(m.group(1)), (sequentize(result)
                .replace('\\', '\\\\')
                .replace('[', '\\[')
                .replace(']', '\\]')
            )
        )

    def id_literal_(what):
        """ Make id_literal like char class """
        match = _re.compile(what).match
        result = ''.join([
            chr(c) for c in xrange(127) if not match(chr(c))
        ])
        return '[^%s]' % fix_charclass(result)

    def not_id_literal_(keep):
        """ Make negated id_literal like char class """
        match = _re.compile(id_literal_(keep)).match
        result = ''.join([
            chr(c) for c in xrange(127) if not match(chr(c))
        ])
        return r'[%s]' % fix_charclass(result)

    if extended:
        id_literal = id_literal_(r'[a-zA-Z0-9_$]')
        id_literal_open = id_literal_(r'[a-zA-Z0-9_${\[(+-]')
        id_literal_close = id_literal_(r'[a-zA-Z0-9_$}\])"\047+-]')

        space_sub = _re.compile((
            r'([^\047"/\000-\040]+)'
            r'|(%(strings)s[^\047"/\000-\040]*)'
            r'|(?:(?<=%(pre_regex)s)%(space)s*(%(regex)s[^\047"/\000-\040]*))'
            r'|(?<=%(id_literal_close)s)'
                r'%(space)s*(?:(%(newline)s)%(space)s*)+'
                r'(?=%(id_literal_open)s)'
            r'|(?<=%(id_literal)s)(%(space)s)+(?=%(id_literal)s)'
            r'|%(space)s+'
            r'|(?:%(newline)s%(space)s*)+'
        ) % locals()).sub
        def space_subber(match):
            """ Substitution callback """
            # pylint: disable = C0321
            groups = match.groups()
            if groups[0]: return groups[0]
            elif groups[1]: return groups[1]
            elif groups[2]: return groups[2]
            elif groups[3]: return '\n'
            elif groups[4]: return ' '
            return ''

        def jsmin(script): # pylint: disable = W0621
            r"""
            Minify javascript based on `jsmin.c by Douglas Crockford`_\.

            Instead of parsing the stream char by char, it uses a regular
            expression approach which minifies the whole script with one big
            substitution regex.

            .. _jsmin.c by Douglas Crockford:
               http://www.crockford.com/javascript/jsmin.c

            :Parameters:
              `script` : ``str``
                Script to minify

            :Return: Minified script
            :Rtype: ``str``
            """
            return space_sub(space_subber, '\n%s\n' % script).strip()

    else:
        not_id_literal = not_id_literal_(r'[a-zA-Z0-9_$]')
        not_id_literal_open = not_id_literal_(r'[a-zA-Z0-9_${\[(+-]')
        not_id_literal_close = not_id_literal_(r'[a-zA-Z0-9_$}\])"\047+-]')

        space_norm_sub = _re.compile((
            r'(%(strings)s)'
            r'|(?:(%(pre_regex)s)%(space)s*(%(regex)s))'
            r'|(%(space)s)+'
            r'|(?:(%(newline)s)%(space)s*)+'
        ) % locals()).sub
        def space_norm_subber(match):
            """ Substitution callback """
            # pylint: disable = C0321
            groups = match.groups()
            if groups[0]: return groups[0]
            elif groups[1]: return groups[1].replace('\r', '\n') + groups[2]
            elif groups[3]: return ' '
            elif groups[4]: return '\n'

        space_sub1 = _re.compile((
            r'[\040\n]?(%(strings)s|%(pre_regex)s%(regex)s)'
            r'|\040(%(not_id_literal)s)'
            r'|\n(%(not_id_literal_open)s)'
        ) % locals()).sub
        def space_subber1(match):
            """ Substitution callback """
            groups = match.groups()
            return groups[0] or groups[1] or groups[2]

        space_sub2 = _re.compile((
            r'(%(strings)s)\040?'
            r'|(%(pre_regex)s%(regex)s)[\040\n]?'
            r'|(%(not_id_literal)s)\040'
            r'|(%(not_id_literal_close)s)\n'
        ) % locals()).sub
        def space_subber2(match):
            """ Substitution callback """
            groups = match.groups()
            return groups[0] or groups[1] or groups[2] or groups[3]

        def jsmin(script):
            r"""
            Minify javascript based on `jsmin.c by Douglas Crockford`_\.

            Instead of parsing the stream char by char, it uses a regular
            expression approach. The script is minified with three passes:

            normalization
                Control characters are mapped to spaces, spaces and newlines
                are squeezed and comments are stripped.
            space removal 1
                Spaces before certain tokens are removed
            space removal 2
                Spaces after certain tokens are removed

            .. _jsmin.c by Douglas Crockford:
               http://www.crockford.com/javascript/jsmin.c

            :Parameters:
              `script` : ``str``
                Script to minify

            :Return: Minified script
            :Rtype: ``str``
            """
            return space_sub2(space_subber2,
                space_sub1(space_subber1,
                    space_norm_sub(space_norm_subber, '\n%s\n' % script)
                )
            ).strip()
    return jsmin
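
fix_charclass() above is a nice illustration of re.sub with a callable replacement: every control character (and the apostrophe) in the generated character class is rewritten as a three-digit octal escape so the class stays portable. The same call in isolation, using the plain re module instead of the _re alias:

import re

raw = "\t '!"
escaped = re.sub(r'([\000-\040\047])',
                 lambda m: '\\%03o' % ord(m.group(1)),
                 raw)
print(escaped)  # \011\040\047!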

Example 80

Project: jcvi Source File: ahrd.py
Function: fix_text
def fix_text(s, ignore_sym_pat=False):

    if not ignore_sym_pat:
        # Fix descriptions like D7TDB1 (
        s = re.sub("([A-Z0-9]){6} \(", "", s)
        s = s.split(";")[0]

    # Fix parentheses containing names
    s = s.translate(None, "[]")
    s = s.replace("(-)", "[-]")
    s = s.replace("(+)", "[+]")
    s = s.replace("(Uncharacterized protein)", "")
    if not ignore_sym_pat:
        s = s.translate(None, "()")

    # fix minor typos, seen in `autonaming` output
    # change 'protei ' to 'protein '
    # change 'hypthetical' to 'hypothetical'
    # fix string starting with 'ytochrome'
    if 'protei ' in s: s = s.replace('protei ', 'protein ')
    if 'hypthetical' in s: s = s.replace('hypthetical', 'hypothetical')
    if s.startswith('ytochrome'): s = s.replace('ytochrome', 'cytochrome')

    # before trimming off at the first ";", check if name has glycosidic
    # linkage information (e.g 1,3 or 1,4). If so, also check if multiple
    # linkages are separated by ";". If so, replace ";" by "-"
    m = re.findall(glycosidic_link_pat, s)
    if m and ";" in s:
        s = re.sub(";\s*", "-", s)

    # remove underscore from description
    s = re.sub("_", " ", s)

    # Cellular locations
    # Any word that matches e.g. AT5G54690
    # Any word that matches e.g. Os02g0234800
    # (fragment)
    # UPF
    # Remove 'DDB_G\d+' ID
    # '_At[0-9]+g[0-9]+' to ''
    for pat in (loc_pat, osg_pat, frag_pat, upf_pat, ddb_pat):
        # below is a hack since word boundaries don't work on /
        s = s.strip() + " "
        s = re.sub(pat, "", s)

    # &apos;? => '
    s = re.sub(apos_pat, "'", s)
    # &gt => none
    s = re.sub(gt_pat, "", s)
    # reduce runs such as -- '''
    s = re.sub(r"[-]+", "-", s)
    s = re.sub(r"[']+", "'", s)

    s = s.strip()

    # -like to -like protein
    s = re.sub(like_pat, "-like protein", s)

    # 'repeat$' to 'repeat protein'
    if re.search(repeat_pat, s):
        s += "-containing protein"

    # 'binding$' to 'binding protein'
    if re.search(binding_pat, s):
        s += " protein"
        if re.match(Protein_pat, s):
            s = re.sub(Protein_pat, "", s)

    # 'domain$' to 'domain-containing protein'
    if re.search(domain_pat, s):
        s += "-containing protein"
        if re.search(r"-domain", s):
            s = re.sub(r"-domain", " domain", s)
        if re.match(Protein_pat, s):
            s = re.sub(Protein_pat, "", s)

    # 'related$' to '-like protein'
    if re.search(related_pat, s):
        s = re.sub(related_pat, "-like protein", s)
        if re.match(Protein_pat, s) and not re.match(r"Protein kinase", s):
            s = re.sub(Protein_pat, "", s)

    # '[0-9]+ humolog' to '-like protein'
    if re.search(humolog_pat1, s):
        s = re.sub(humolog_pat1, "-like protein", s)
        if re.match(Protein_pat, s):
            s = re.sub(Protein_pat, "", s)

    # 'Protein\s+(.*)\s+humolog' to '$1-like protein'
    match = re.search(humolog_pat2, s)
    if match and not re.match(r"Protein kinase", s):
        ret = match.group(1)
        s = re.sub(humolog_pat2, ret + "-like protein", s)
        s = re.sub(r"^\s+", "", s)
        s = s.capitalize()

    # 'humolog protein' to '-like protein'
    # 'humologue$' to '-like protein'
    # 'humolog$' to '-like protein'
    for pat in (humolog_pat3, humolog_pat5, humolog_pat6):
        if re.search(pat, s):
            s = re.sub(pat, "-like protein", s)

    # 'Agenet domain-containing protein / bromo-adjacent humology (BAH) domain-containing protein'
    # to 'Agenet and bromo-adjacent humology (BAH) domain-containing protein'
    if re.search(agenet_pat, s):
        s = re.sub(agenet_pat, "Agenet and ", s)

    # plural to singular
    if re.search(plural_pat, s):
        if (s.find('biogenesis') == -1 and s.find('Topors') == -1) or (not re.search(with_and_pat, s)):
            s = re.sub(r"s$", "", s)

    # 'like_TBP' or 'likeTBP' to 'like TBP'
    if re.search(tbp_pat, s):
        s = re.sub(tbp_pat, "like TBP", s)

    # 'protein protein' to 'protein'
    if re.search(prot_pat, s):
        s = re.sub(prot_pat, "protein", s)

    # 'dimerisation' to 'dimerization'
    if re.search(dimer_pat, s):
        s = re.sub(dimer_pat, "dimerization", s)

    # Any AHRD that matches e.g. "AT5G54690-like protein"
    # Any AHRD that contains the words '^Belongs|^Encoded|^Expression|^highly'
    for pat in (atg_pat, athila_pat1):
        if re.search(pat, s):
            s = Unknown

    # remove 'arabidopsis[ thaliana]' and/or embedded Atg IDs
    for pat in (atg_id_pat, athila_pat2, athila_pat3, athila_pat4):
        # below is a hack since word boundaries don't work on /
        s = s.strip() + " "
        s = re.sub(pat, "", s)

    # remove "\s+LENGTH=\d+" from TAIR deflines
    if re.search(length_pat, s):
        s = re.sub(length_pat, "", s)

    # if name has a dot followed by a space (". ") in it and contains multiple
    # parts separated by a comma, strip name starting from first occurrence of ","
    if re.search(r"\. ", s):
        if re.search(r",", s):
            s = s.split(",")[0]

    # if name contains any of the disallowed words,
    # remove word occurrence from name
    # if name contains references to any other organism, trim name upto
    # that occurrence
    for pat in (disallow_pat, organism_pat):
        if re.search(pat, s):
            s = re.sub(pat, "", s)

    s = s.strip()

    if not ignore_sym_pat:
        # 'humolog \d+' to '-like protein'
        if re.search(humolog_pat4, s):
            s = re.sub(humolog_pat4, "", s)

        # Trailing protein numeric copy (e.g. Myb 1)
        if re.search(trail_pat, s):
            s = re.sub(trail_pat, "", s)

        # if name is entirely a gene symbol-like (all capital letters, maybe followed by numbers)
        # add a "-like protein" at the end
        if (re.search(sym_pat, s) or re.search(lc_sym_pat, s)) \
                and not re.search(spada_pat, s):
            s = s + "-like protein"

        # if gene symbol in parentheses at EOL, remove symbol
        if re.search(eol_sym_pat, s):
            s = re.sub(eol_sym_pat, "", s)

        # if name terminates at a symbol([^A-Za-z0-9_]), trim it off
        if re.search(r"\W{1,}$", s) and not re.search(r"\)$", s):
            s = re.sub("\W{1,}$", "", s)

        if "uncharacterized" in s:
            s = "uncharacterized protein"

    # change sulfer to sulfur
    if re.search(sulfer_pat, s):
        s = re.sub(sulfer_pat, "sulfur", s)

    # change sulph to sulf
    if re.search(sulph_pat, s):
        s = re.sub(sulph_pat, "sulf", s)

    # change monoxy to monooxy
    if re.search(monoxy_pat, s):
        s = re.sub(monoxy_pat, "monooxy", s)

    # change proteine to protein
    if re.search(proteine_pat, s):
        s = re.sub(proteine_pat, "protein", s)

    # change signalling to signaling
    if re.search(signalling_pat, s):
        s = re.sub(signalling_pat, "signaling", s)

    # change aluminium to aluminum
    if re.search(aluminium_pat, s):
        s = re.sub(aluminium_pat, "aluminum", s)

    # change haem to heme
    if re.search(haem_pat, s):
        s = re.sub(haem_pat, "heme", s)

    # change haemo to hemo
    if re.search(haemo_pat, s):
        s = re.sub(haemo_pat, "hemo", s)

    # change assessory to accessory
    if re.search(assessory_pat, s):
        s = re.sub(assessory_pat, "accessory", s)

    # change -ise/-ised/-isation to -ize/-ized/-ization
    match = re.search(ise_pat, s)
    if match:
        ret = match.group(1)
        if match.group(2):
            suff = match.group(2)
            s = re.sub(ise_pat, "{0}ize{1}".format(ret, suff), s)
        else:
            s = re.sub(ise_pat, "{0}ize".format(ret), s)

    match = re.search(isation_pat, s)
    if match:
        ret = match.group(1)
        s = re.sub(isation_pat, "{0}ization".format(ret), s)

    # change -bre to -ber
    match = re.search(bre_pat, s)
    if match:
        ret = match.group(1)
        s = re.sub(bre_pat, "{0}ber".format(ret), s)

    if not s.startswith(Hypothetical):
        # 'Candidate|Hypothetical|Novel|Predicted|Possible|Probable|Uncharacterized' to 'Putative'
        if s.startswith('Uncharacterized') and any(pat in s for pat in ('UCP', 'UPF', 'protein')):
            pass
        else:
            if re.search(put_pat, s):
                s = re.sub(put_pat, "Putative", s)

    """
    case (qr/^Histone-lysine/) { $ahrd =~ s/,\s+H\d{1}\s+lysine\-\d+//gs; }
    """
    sl = s.lower()

    # Any mention of `clone` or `contig` is not informative
    if "clone" in sl or "contig" in sl:
        s = Unknown

    # All that's left is `protein` is not informative
    if sl in ("protein", "protein, putative", ""):
        s = Unknown

    if Unknown.lower() in sl:
        s = Unknown

    if "FUNCTIONS IN".lower() in sl and "unknown" in sl:
        s = Unknown

    if "LOCATED IN".lower() in sl:
        s = Unknown

    s = re.sub(r"[,]*\s+putative$", "", s)

    if s == Unknown or s.strip() == "protein":
        s = Hypothetical

    # Compact all spaces
    s = ' '.join(s.split())

    assert s.strip()

    return s
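
fix_text() is essentially a long pipeline of re.search guards followed by re.sub rewrites over a protein description string. Two idioms recur: collapsing runs of a character and rewriting around a captured group (the ise_pat / isation_pat spelling fixes). A standalone sketch of both, with a simplified pattern standing in for the module-level isation_pat, which is not shown in the excerpt:

import re

s = "Probable polymerisation factor -- dimerisation domain"
s = re.sub(r"[-]+", "-", s)                    # collapse runs of dashes
s = re.sub(r"(\w+?)isation", r"\1ization", s)  # British -isation -> -ization
print(s)  # Probable polymerization factor - dimerization domain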

Example 81

Project: cgat Source File: cgat2rdf.py
def processScript(script_name, outfile, options):
    '''process one script.'''

    # call other script
    dirname = os.path.dirname(script_name)
    basename = os.path.basename(script_name)[:-3]

    if options.src_dir:
        dirname = options.src_dir
        script_name = os.path.join(dirname, basename) + ".py"

    sys.path.insert(0, dirname)
    module = __import__(basename)

    E.Start = LocalStart
    E.info("loaded modules %s" % module)
    try:
        module.main(argv=["--help"])
    except DummyError:
        pass

    # get script's docstring
    docstring = module.__doc__

    # for k in dir(PARSER):
    #     print k, getattr(PARSER, k)
    # for option in PARSER.option_list:
    # print option, option.type, option.help, option._short_opts,
    # option._long_opts, option.default

    # @prefix clp: <http://www.humgen.nl/climate/ontologies/clp#> .
    # @prefix co: <http://www.isi.edu/ikcap/Wingse/componentOntology.owl#> .
    # @prefix dcterms: <http://purl.org/dc/terms/> .

    # n = Namespace("http://example.org/people/")
    g = Generator()

    data = collections.defaultdict(str)

    data['meta_title'] = 'Interface generator for CGAT scripts'
    data['meta_author'] = 'Andreas Heger'
    data['meta_version'] = 0.1

    data['name'] = basename
    data['interpreter'] = 'python'
    data['property_bag'] = {}
    data['description'] = getDescription(basename, docstring)
    data['help'] = docstring
    data['version'] = "1.0"
    data['owner'] = "CGAT"
    data['email'] = "[email protected]"
    data['binary'] = script_name

    # does not output multiple files
    data['multiple_output_files'] = False

    input_format, output_format = guessFormats(basename, docstring)

    stdin = {}
    stdin['name'] = 'input_file'
    stdin['ns_name'] = 'input_file'
    stdin['type'] = 'stdin'
    stdin['label'] = 'input file'
    stdin['description'] = 'input file'
    stdin['choices'] = None
    stdin['format'] = MAP_TYPE2FORMAT.get(input_format, input_format)
    stdin['rank'] = 1
    stdin['display'] = 'show'
    stdin['min_occurrence'] = 1
    stdin['max_occurrence'] = 1
    stdin['value'] = ""
    stdin['arg'] = "<"
    stdin['arg_long'] = ""
    stdin['property_bag'] = {}
    stdin['dependencies'] = {}

    stdout = {}
    stdout['name'] = 'tsvfile'
    stdout['ns_name'] = 'tsvfile'
    stdout['type'] = 'stdout'
    stdout['label'] = 'table'
    stdout['description'] = 'bam file'
    stdout['choices'] = None
    stdout['format'] = MAP_TYPE2FORMAT.get(output_format, output_format)
    stdout['rank'] = 1
    stdout['display'] = 'show'
    stdout['min_occurrence'] = 1
    stdout['max_occurrence'] = 1
    stdout['value'] = ""
    stdout['arg'] = ">"
    stdout['arg_long'] = ""
    stdout['property_bag'] = {}
    stdout['dependencies'] = {}

    outputs = [stdout]

    data['parameters'] = [stdin, stdout]

    defaults = PARSER.get_default_values()

    # flag to indicate whether script needs to go through cgat_wrapper.py
    use_wrapper = False

    for option in PARSER.option_list:
        # ignore options added by optparse
        if option.dest is None:
            continue

        # ignore benchmarking options
        if option.dest.startswith("timeit"):
            continue

        # ignore options related to forcing output
        if "force" in option.dest:
            continue

        # ignore some special options:
        # if option.dest in ("output_filename_pattern", ):
        #    continue

        # ignore output options
        if option.dest in ("stdin", "stdout", "stdlog", "stderr", "loglevel"):
            continue

        # remove default from help string
        option.help = re.sub("\[[^\]]*%default[^\]]*\]", "", option.help)

        param = buildParam()

        # get command line option call (long/short option)
        try:
            param['arg'] = option._short_opts[0]
        except IndexError:
            pass

        try:
            param['arg_long'] = option._long_opts[0]
        except IndexError:
            pass

        assert 'arg' in param or 'arg_long' in param

        # print "----------------------------------"
        # print [(x,getattr(option,x)) for x in dir( option )]

        param['name'] = option.dest
        param['ns_name'] = option.dest
        if option.type == "int":
            param['type'] = "integer"
        elif option.type == "float":
            param['type'] = "float"
        elif option.type == "string":
            param['type'] = "text"
            if option.metavar:
                mvar = option.metavar.lower()
                if mvar in MAP_TYPE2FORMAT:
                    param['format'] = MAP_TYPE2FORMAT[mvar]
                    param['type'] = "data"
                if mvar == "bam":
                    use_wrapper = True
                    data['parameters'].append(buildParam(
                        name='wrapper_bam_file',
                        ns_name='wrapper_bam_file',
                        arg_long='--wrapper-bam-file',
                        label=option.dest,
                        type='data',
                        format='bam',
                        help=option.help,
                        value=getattr(defaults,  option.dest)))

                    data['parameters'].append(buildParam(
                        name='wrapper_bam_index',
                        ns_name='wrapper_bam_index',
                        arg_long='--wrapper-bai-file',
                        type='data',
                        value='${wrapper_bam_file.metadata.bam_index}',
                        display='hidden'))

                    # use long argument
                    data['parameters'].append(buildParam(
                        name='wrapper_bam_option',
                        ns_name='wrapper_bam_option',
                        arg_long='--wrapper-bam-option',
                        value=param[
                            'arg_long'],
                        display='hidden'))

                    continue

        elif option.type == "choice":
            param['type'] = "select"
            param['choices'] = option.choices
            if option.action == "append":
                param['multiple'] = True
        elif option.action.startswith("store"):
            param['type'] = "boolean"
        else:
            raise ValueError("unknown type for %s" % str(option))

        param['label'] = option.dest
        param['description'] = option.help
        param['rank'] = 1
        param['display'] = 'show'
        param['min_occurrence'] = 0
        param['max_occurrence'] = 1

        # get default value
        param['value'] = getattr(defaults,  option.dest)

        param['dependencies'] = {}
        param['property_bag'] = {}

        if option.dest == "genome_file":
            param['property_bag'] = {'from_loc': 'path',
                                     'loc_id': 'sam_fa',
                                     'loc_id_filter': '1'}

        # deal with multiple output files:
        if option.dest == "output_filename_pattern":
            use_wrapper = True
            data['parameters'].append(buildParam(
                name='wrapper_html_file',
                ns_name='wrapper_html_file',
                arg_long='--wrapper-html-file',
                value='$html_file',
                display='hidden'))

            data['parameters'].append(buildParam(
                name='wrapper_html_dir',
                ns_name='wrapper_html_dir',
                arg_long='--wrapper-html-dir',
                value='$html_file.files_path',
                display='hidden'))

            outputs.append(buildParam(name='html_file',
                                      ns_name='html_file',
                                      format='html',
                                      label='html'),
                           )
            continue

        data['parameters'].append(param)

    if options.output_format == "rdf":
        outfile.write(g.serialize(data, format='turtle') + "\n")

    elif options.output_format == "galaxy":

        if use_wrapper:

            # add hidden option for wrapper
            param = buildParam(
                name='wrapper-command',
                ns_name='wrapper-command',
                display='hidden',
                type='text',
                value=data['binary'],
                label='wrapper',
                description='wrapper',
                arg_long="--wrapper-command")

            data['parameters'].append(param)

            # point to wrapper
            data['binary'] = os.path.join(dirname, "cgat_galaxy_wrapper.py")

        displayMap = collections.defaultdict(list)

        for param in data['parameters']:
            displayMap[param['display']].append(param)

        displayMap['normal'] = displayMap['show']

        target = Template(
           IOTools.openFile('/ifs/devel/andreas/cgat/scripts/cgat2rdf/galaxy.xml').read())
        outfile.write(target.render(data=data,
                                    displayMap=displayMap,
                                    outputs=outputs) + "\n")
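
The single re.sub in this example strips the "[... %default ...]" suffix that optparse appends to help strings before they are exported to RDF or Galaxy XML. A minimal, self-contained sketch of that cleanup:

import re

# Remove the bracketed default hint from an optparse-style help string.
help_text = "output filename pattern [default: %default]"
help_text = re.sub(r"\[[^\]]*%default[^\]]*\]", "", help_text).strip()
print(help_text)  # output filename pattern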

Example 82

Project: termite-data-server Source File: ttfonts.py
    def extractInfo(self): 
        #################/
        # name - Naming table
        #################/
        self.sFamilyClass = 0
        self.sFamilySubClass = 0

        name_offset = self.seek_table("name")
        format = self.read_ushort()
        if (format != 0):
            die("Unknown name table format " + format)
        numRecords = self.read_ushort()
        string_data_offset = name_offset + self.read_ushort()
        names = {1:'',2:'',3:'',4:'',6:''}
        K = names.keys()
        nameCount = len(names)
        for i in range(numRecords): 
            platformId = self.read_ushort()
            encodingId = self.read_ushort()
            languageId = self.read_ushort()
            nameId = self.read_ushort()
            length = self.read_ushort()
            offset = self.read_ushort()
            if (nameId not in K): continue
            N = ''
            if (platformId == 3 and encodingId == 1 and languageId == 0x409):  # Microsoft, Unicode, US English, PS Name
                opos = self._pos
                self.seek(string_data_offset + offset)
                if (length % 2 != 0):
                    die("PostScript name is UTF-16BE string of odd length")
                length /= 2
                N = ''
                while (length > 0):
                    char = self.read_ushort()
                    N += (chr(char))
                    length -= 1
                self._pos = opos
                self.seek(opos)
            
            elif (platformId == 1 and encodingId == 0 and languageId == 0):  # Macintosh, Roman, English, PS Name
                opos = self._pos
                N = self.get_chunk(string_data_offset + offset, length)
                self._pos = opos
                self.seek(opos)
            
            if (N and names[nameId]==''):
                names[nameId] = N
                nameCount -= 1
                if (nameCount==0): break
            
        
        if (names[6]):
            psName = names[6]
        elif (names[4]):
            psName = re.sub(' ','-',names[4])
        elif (names[1]):
            psName = re.sub(' ','-',names[1])
        else:
            psName = ''
        if (not psName):
            die("Could not find PostScript font name")
        self.name = psName
        if (names[1]):
            self.familyName = names[1]  
        else:  
            self.familyName = psName 
        if (names[2]):
            self.styleName = names[2]
        else:
            self.styleName = 'Regular' 
        if (names[4]):
            self.fullName = names[4]
        else:
            self.fullName = psName 
        if (names[3]):
            self.uniqueFontID = names[3]
        else:
            self.uniqueFontID = psName 
        if (names[6]):
            self.fullName = names[6] 

        #################/
        # head - Font header table
        #################/
        self.seek_table("head")
        self.skip(18) 
        self.unitsPerEm = unitsPerEm = self.read_ushort()
        scale = 1000 / float(unitsPerEm)
        self.skip(16)
        xMin = self.read_short()
        yMin = self.read_short()
        xMax = self.read_short()
        yMax = self.read_short()
        self.bbox = [(xMin*scale), (yMin*scale), (xMax*scale), (yMax*scale)]
        self.skip(3*2)
        indexToLocFormat = self.read_ushort()
        glyphDataFormat = self.read_ushort()
        if (glyphDataFormat != 0):
            die('Unknown glyph data format ' + str(glyphDataFormat))

        #################/
        # hhea metrics table
        #################/
        # ttf2t1 seems to use this value rather than the one in OS/2 - so put in for compatibility
        if ("hhea" in self.tables):
            self.seek_table("hhea")
            self.skip(4)
            hheaAscender = self.read_short()
            hheaDescender = self.read_short()
            self.ascent = (hheaAscender *scale)
            self.descent = (hheaDescender *scale)
        

        #################/
        # OS/2 - OS/2 and Windows metrics table
        #################/
        if ("OS/2" in self.tables): 
            self.seek_table("OS/2")
            version = self.read_ushort()
            self.skip(2)
            usWeightClass = self.read_ushort()
            self.skip(2)
            fsType = self.read_ushort()
            if (fsType == 0x0002 or (fsType & 0x0300) != 0): 
                die('ERROR - Font file ' + self.filename + ' cannot be embedded due to copyright restrictions.')
                self.restrictedUse = True
            
            self.skip(20)
            sF = self.read_short()
            self.sFamilyClass = (sF >> 8)
            self.sFamilySubClass = (sF & 0xFF)
            self._pos += 10  #PANOSE = 10 byte length
            panose = self.fh.read(10)
            self.skip(26)
            sTypoAscender = self.read_short()
            sTypoDescender = self.read_short()
            if (not self.ascent): 
                self.ascent = (sTypoAscender*scale)
            if (not self.descent): 
                self.descent = (sTypoDescender*scale)
            if (version > 1):
                self.skip(16)
                sCapHeight = self.read_short()
                self.capHeight = (sCapHeight*scale)
            else:
                self.capHeight = self.ascent            
        
        else:
            usWeightClass = 500
            if (not self.ascent): self.ascent = (yMax*scale)
            if (not self.descent): self.descent = (yMin*scale)
            self.capHeight = self.ascent
        
        self.stemV = 50 + int(pow((usWeightClass / 65.0),2))

        #################/
        # post - PostScript table
        #################/
        self.seek_table("post")
        self.skip(4) 
        self.italicAngle = self.read_short() + self.read_ushort() / 65536.0
        self.underlinePosition = self.read_short() * scale
        self.underlineThickness = self.read_short() * scale
        isFixedPitch = self.read_ulong()

        self.flags = 4

        if (self.italicAngle!= 0):
            self.flags = self.flags | 64
        if (usWeightClass >= 600):
            self.flags = self.flags | 262144
        if (isFixedPitch):
            self.flags = self.flags | 1

        #################/
        # hhea - Horizontal header table
        #################/
        self.seek_table("hhea")
        self.skip(32) 
        metricDataFormat = self.read_ushort()
        if (metricDataFormat != 0):
            die('Unknown horizontal metric data format ' + str(metricDataFormat))
        numberOfHMetrics = self.read_ushort()
        if (numberOfHMetrics == 0):
            die('Number of horizontal metrics is 0')

        #################/
        # maxp - Maximum profile table
        #################/
        self.seek_table("maxp")
        self.skip(4)
        numGlyphs = self.read_ushort()

        #################/
        # cmap - Character to glyph index mapping table
        #################/
        cmap_offset = self.seek_table("cmap")
        self.skip(2)
        cmapTableCount = self.read_ushort()
        unicode_cmap_offset = 0
        unicode_cmap_offset12 = 0
        
        for i in range(cmapTableCount):
            platformID = self.read_ushort()
            encodingID = self.read_ushort()
            offset = self.read_ulong()
            save_pos = self._pos
            if platformID == 3 and encodingID == 10:  # Microsoft, UCS-4
                format = self.get_ushort(cmap_offset + offset)
                if (format == 12):
                    if not unicode_cmap_offset12:
                        unicode_cmap_offset12 = cmap_offset + offset
                    break
            if ((platformID == 3 and encodingID == 1) or platformID == 0):  # Microsoft, Unicode
                format = self.get_ushort(cmap_offset + offset)
                if (format == 4):
                    if (not unicode_cmap_offset):
                        unicode_cmap_offset = cmap_offset + offset
                    break
                    
            self.seek(save_pos)
        
        if not unicode_cmap_offset and not unicode_cmap_offset12:
            die('Font (' + self.filename + ') does not have cmap for Unicode (platform 3, encoding 1, format 4, or platform 3, encoding 10, format 12, or platform 0, any encoding, format 4)')

        glyphToChar = {}
        charToGlyph = {}
        if unicode_cmap_offset12:
            self.getCMAP12(unicode_cmap_offset12, glyphToChar, charToGlyph)
        else:    
            self.getCMAP4(unicode_cmap_offset, glyphToChar, charToGlyph)

        #################/
        # hmtx - Horizontal metrics table
        #################/
        self.getHMTX(numberOfHMetrics, numGlyphs, glyphToChar, scale)
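
The re.sub calls in this example only turn spaces into hyphens when falling back from the missing PostScript name record (nameId 6) to the full or family name. A minimal sketch with a hypothetical font name; str.replace(' ', '-') would behave identically for this fixed pattern:

import re

full_name = "DejaVu Sans Condensed"   # hypothetical name record 4
ps_name = re.sub(' ', '-', full_name)
print(ps_name)  # DejaVu-Sans-Condensed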

Example 83

Project: cgstudiomap Source File: escpos.py
    def receipt(self,xml):
        """
        Prints an xml based receipt definition
        """

        def strclean(string):
            if not string:
                string = ''
            string = string.strip()
            string = re.sub('\s+',' ',string)
            return string

        def format_value(value, decimals=3, width=0, decimals_separator='.', thousands_separator=',', autoint=False, symbol='', position='after'):
            decimals = max(0,int(decimals))
            width    = max(0,int(width))
            value    = float(value)

            if autoint and math.floor(value) == value:
                decimals = 0
            if width == 0:
                width = ''

            if thousands_separator:
                formatstr = "{:"+str(width)+",."+str(decimals)+"f}"
            else:
                formatstr = "{:"+str(width)+"."+str(decimals)+"f}"


            ret = formatstr.format(value)
            ret = ret.replace(',','COMMA')
            ret = ret.replace('.','DOT')
            ret = ret.replace('COMMA',thousands_separator)
            ret = ret.replace('DOT',decimals_separator)

            if symbol:
                if position == 'after':
                    ret = ret + symbol
                else:
                    ret = symbol + ret
            return ret

        def print_elem(stylestack, serializer, elem, indent=0):

            elem_styles = {
                'h1': {'bold': 'on', 'size':'double'},
                'h2': {'size':'double'},
                'h3': {'bold': 'on', 'size':'double-height'},
                'h4': {'size': 'double-height'},
                'h5': {'bold': 'on'},
                'em': {'font': 'b'},
                'b':  {'bold': 'on'},
            }

            stylestack.push()
            if elem.tag in elem_styles:
                stylestack.set(elem_styles[elem.tag])
            stylestack.set(elem.attrib)

            if elem.tag in ('p','div','section','article','receipt','header','footer','li','h1','h2','h3','h4','h5'):
                serializer.start_block(stylestack)
                serializer.text(elem.text)
                for child in elem:
                    print_elem(stylestack,serializer,child)
                    serializer.start_inline(stylestack)
                    serializer.text(child.tail)
                    serializer.end_entity()
                serializer.end_entity()

            elif elem.tag in ('span','em','b','left','right'):
                serializer.start_inline(stylestack)
                serializer.text(elem.text)
                for child in elem:
                    print_elem(stylestack,serializer,child)
                    serializer.start_inline(stylestack)
                    serializer.text(child.tail)
                    serializer.end_entity()
                serializer.end_entity()

            elif elem.tag == 'value':
                serializer.start_inline(stylestack)
                serializer.pre(format_value( 
                                              elem.text,
                                              decimals=stylestack.get('value-decimals'),
                                              width=stylestack.get('value-width'),
                                              decimals_separator=stylestack.get('value-decimals-separator'),
                                              thousands_separator=stylestack.get('value-thousands-separator'),
                                              autoint=(stylestack.get('value-autoint') == 'on'),
                                              symbol=stylestack.get('value-symbol'),
                                              position=stylestack.get('value-symbol-position') 
                                            ))
                serializer.end_entity()

            elif elem.tag == 'line':
                width = stylestack.get('width')
                if stylestack.get('size') in ('double', 'double-width'):
                    width = width / 2

                lineserializer = XmlLineSerializer(stylestack.get('indent')+indent,stylestack.get('tabwidth'),width,stylestack.get('line-ratio'))
                serializer.start_block(stylestack)
                for child in elem:
                    if child.tag == 'left':
                        print_elem(stylestack,lineserializer,child,indent=indent)
                    elif child.tag == 'right':
                        lineserializer.start_right()
                        print_elem(stylestack,lineserializer,child,indent=indent)
                serializer.pre(lineserializer.get_line())
                serializer.end_entity()

            elif elem.tag == 'ul':
                serializer.start_block(stylestack)
                bullet = stylestack.get('bullet')
                for child in elem:
                    if child.tag == 'li':
                        serializer.style(stylestack)
                        serializer.raw(' ' * indent * stylestack.get('tabwidth') + bullet)
                    print_elem(stylestack,serializer,child,indent=indent+1)
                serializer.end_entity()

            elif elem.tag == 'ol':
                cwidth = len(str(len(elem))) + 2
                i = 1
                serializer.start_block(stylestack)
                for child in elem:
                    if child.tag == 'li':
                        serializer.style(stylestack)
                        serializer.raw(' ' * indent * stylestack.get('tabwidth') + ' ' + (str(i)+')').ljust(cwidth))
                        i = i + 1
                    print_elem(stylestack,serializer,child,indent=indent+1)
                serializer.end_entity()

            elif elem.tag == 'pre':
                serializer.start_block(stylestack)
                serializer.pre(elem.text)
                serializer.end_entity()

            elif elem.tag == 'hr':
                width = stylestack.get('width')
                if stylestack.get('size') in ('double', 'double-width'):
                    width = width / 2
                serializer.start_block(stylestack)
                serializer.text('-'*width)
                serializer.end_entity()

            elif elem.tag == 'br':
                serializer.linebreak()

            elif elem.tag == 'img':
                if 'src' in elem.attrib and 'data:' in elem.attrib['src']:
                    self.print_base64_image(elem.attrib['src'])

            elif elem.tag == 'barcode' and 'encoding' in elem.attrib:
                serializer.start_block(stylestack)
                self.barcode(strclean(elem.text),elem.attrib['encoding'])
                serializer.end_entity()

            elif elem.tag == 'cut':
                self.cut()
            elif elem.tag == 'partialcut':
                self.cut(mode='part')
            elif elem.tag == 'cashdraw':
                self.cashdraw(2)
                self.cashdraw(5)

            stylestack.pop()

        try:
            stylestack      = StyleStack() 
            serializer      = XmlSerializer(self)
            root            = ET.fromstring(xml.encode('utf-8'))

            self._raw(stylestack.to_escpos())

            print_elem(stylestack,serializer,root)

            if 'open-cashdrawer' in root.attrib and root.attrib['open-cashdrawer'] == 'true':
                self.cashdraw(2)
                self.cashdraw(5)
            if not 'cut' in root.attrib or root.attrib['cut'] == 'true' :
                self.cut()

        except Exception as e:
            errmsg = str(e)+'\n'+'-'*48+'\n'+traceback.format_exc() + '-'*48+'\n'
            self.text(errmsg)
            self.cut()

            raise e
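
The re.sub of interest is in strclean(): trim the text of an XML node and collapse any run of whitespace to a single space before it is sent to the printer. A minimal sketch:

import re

raw = "  Total:\t\t 42.00 \n"
print(re.sub(r'\s+', ' ', raw.strip()))  # Total: 42.00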

Example 84

Project: SickRage Source File: parser.py
    def _parse_string(self, name):  # pylint: disable=too-many-locals, too-many-branches, too-many-statements
        if not name:
            return

        matches = []
        bestResult = None

        for (cur_regex_num, cur_regex_name, cur_regex) in self.compiled_regexes:
            match = cur_regex.match(name)

            if not match:
                continue

            result = ParseResult(name)
            result.which_regex = [cur_regex_name]
            result.score = 0 - cur_regex_num

            named_groups = match.groupdict().keys()

            if 'series_name' in named_groups:
                result.series_name = match.group('series_name')
                if result.series_name:
                    result.series_name = self.clean_series_name(result.series_name)
                    result.score += 1

            if 'series_num' in named_groups and match.group('series_num'):
                result.score += 1

            if 'season_num' in named_groups:
                tmp_season = int(match.group('season_num'))
                if cur_regex_name == 'bare' and tmp_season in (19, 20):
                    continue
                result.season_number = tmp_season
                result.score += 1

            if 'ep_num' in named_groups:
                ep_num = self._convert_number(match.group('ep_num'))
                if 'extra_ep_num' in named_groups and match.group('extra_ep_num'):
                    result.episode_numbers = range(ep_num, self._convert_number(match.group('extra_ep_num')) + 1)
                    result.score += 1
                else:
                    result.episode_numbers = [ep_num]
                result.score += 3

            if 'ep_ab_num' in named_groups:
                ep_ab_num = self._convert_number(match.group('ep_ab_num'))
                if 'extra_ab_ep_num' in named_groups and match.group('extra_ab_ep_num'):
                    result.ab_episode_numbers = range(ep_ab_num,
                                                      self._convert_number(match.group('extra_ab_ep_num')) + 1)
                    result.score += 1
                else:
                    result.ab_episode_numbers = [ep_ab_num]
                result.score += 1

            if 'air_date' in named_groups:
                air_date = match.group('air_date')
                try:
                    assert re.sub(r'[^\d]*', '', air_date) != '112263'
                    result.air_date = dateutil.parser.parse(air_date, fuzzy_with_tokens=True)[0].date()
                    result.score += 1
                except Exception:
                    continue

            if 'extra_info' in named_groups:
                tmp_extra_info = match.group('extra_info')

                # Show.S04.Special or Show.S05.Part.2.Extras is almost certainly not every episode in the season
                if tmp_extra_info and cur_regex_name == 'season_only' and re.search(
                        r'([. _-]|^)(special|extra)s?\w*([. _-]|$)', tmp_extra_info, re.I):
                    continue
                result.extra_info = tmp_extra_info
                result.score += 1

            if 'release_group' in named_groups:
                result.release_group = match.group('release_group')
                result.score += 1

            if 'version' in named_groups:
                # assigns version to anime file if detected using anime regex. Non-anime regex receives -1
                version = match.group('version')
                if version:
                    result.version = version
                else:
                    result.version = 1
            else:
                result.version = -1

            matches.append(result)

        if matches:
            # pick best match with highest score based on placement
            bestResult = max(sorted(matches, reverse=True, key=lambda x: x.which_regex), key=lambda x: x.score)

            show = None
            if not self.naming_pattern:
                # try and create a show object for this result
                show = helpers.get_show(bestResult.series_name, self.tryIndexers)

            # confirm passed in show object indexer id matches result show object indexer id
            if show:
                if self.showObj and show.indexerid != self.showObj.indexerid:
                    show = None
                bestResult.show = show
            elif not show and self.showObj:
                bestResult.show = self.showObj

            # if this is a naming pattern test or result doesn't have a show object then return best result
            if not bestResult.show or self.naming_pattern:
                return bestResult

            # get quality
            bestResult.quality = common.Quality.nameQuality(name, bestResult.show.is_anime)

            new_episode_numbers = []
            new_season_numbers = []
            new_absolute_numbers = []

            # if we have an air-by-date show then get the real season/episode numbers
            if bestResult.is_air_by_date:
                airdate = bestResult.air_date.toordinal()
                main_db_con = db.DBConnection()
                sql_result = main_db_con.select(
                    "SELECT season, episode FROM tv_episodes WHERE showid = ? and indexer = ? and airdate = ?",
                    [bestResult.show.indexerid, bestResult.show.indexer, airdate])

                season_number = None
                episode_numbers = []

                if sql_result:
                    season_number = int(sql_result[0][0])
                    episode_numbers = [int(sql_result[0][1])]

                if season_number is None or not episode_numbers:
                    try:
                        lINDEXER_API_PARMS = sickbeard.indexerApi(bestResult.show.indexer).api_params.copy()

                        lINDEXER_API_PARMS['language'] = bestResult.show.lang or sickbeard.INDEXER_DEFAULT_LANGUAGE

                        t = sickbeard.indexerApi(bestResult.show.indexer).indexer(**lINDEXER_API_PARMS)

                        epObj = t[bestResult.show.indexerid].airedOn(bestResult.air_date)[0]

                        season_number = int(epObj["seasonnumber"])
                        episode_numbers = [int(epObj["episodenumber"])]
                    except sickbeard.indexer_episodenotfound:
                        logger.log(u"Unable to find episode with date " + str(bestResult.air_date) + " for show " + bestResult.show.name + ", skipping", logger.WARNING)
                        episode_numbers = []
                    except sickbeard.indexer_error as e:
                        logger.log(u"Unable to contact " + sickbeard.indexerApi(bestResult.show.indexer).name + ": " + ex(e), logger.WARNING)
                        episode_numbers = []

                for epNo in episode_numbers:
                    s = season_number
                    e = epNo

                    if bestResult.show.is_scene:
                        (s, e) = scene_numbering.get_indexer_numbering(bestResult.show.indexerid,
                                                                       bestResult.show.indexer,
                                                                       season_number,
                                                                       epNo)
                    new_episode_numbers.append(e)
                    new_season_numbers.append(s)

            elif bestResult.show.is_anime and bestResult.ab_episode_numbers:
                scene_season = scene_exceptions.get_scene_exception_by_name(bestResult.series_name)[1]
                for epAbsNo in bestResult.ab_episode_numbers:
                    a = epAbsNo

                    if bestResult.show.is_scene:
                        a = scene_numbering.get_indexer_absolute_numbering(bestResult.show.indexerid,
                                                                           bestResult.show.indexer, epAbsNo,
                                                                           True, scene_season)

                    (s, e) = helpers.get_all_episodes_from_absolute_number(bestResult.show, [a])

                    new_absolute_numbers.append(a)
                    new_episode_numbers.extend(e)
                    new_season_numbers.append(s)

            elif bestResult.season_number and bestResult.episode_numbers:
                for epNo in bestResult.episode_numbers:
                    s = bestResult.season_number
                    e = epNo

                    if bestResult.show.is_scene:
                        (s, e) = scene_numbering.get_indexer_numbering(bestResult.show.indexerid,
                                                                       bestResult.show.indexer,
                                                                       bestResult.season_number,
                                                                       epNo)
                    if bestResult.show.is_anime:
                        a = helpers.get_absolute_number_from_season_and_episode(bestResult.show, s, e)
                        if a:
                            new_absolute_numbers.append(a)

                    new_episode_numbers.append(e)
                    new_season_numbers.append(s)

            # need to do a quick sanity check here.  It's possible that we now have episodes
            # from more than one season (by tvdb numbering), and this is just too much
            # for sickbeard, so we'd need to flag it.
            new_season_numbers = list(set(new_season_numbers))  # remove duplicates
            if len(new_season_numbers) > 1:
                raise InvalidNameException("Scene numbering results episodes from "
                                           "seasons %s, (i.e. more than one) and "
                                           "sickrage does not support this.  "
                                           "Sorry." % (str(new_season_numbers)))

            # I guess it's possible that we'd have duplicate episodes too, so let's
            # eliminate them
            new_episode_numbers = list(set(new_episode_numbers))
            new_episode_numbers.sort()

            # maybe even duplicate absolute numbers so why not do them as well
            new_absolute_numbers = list(set(new_absolute_numbers))
            new_absolute_numbers.sort()

            if new_absolute_numbers:
                bestResult.ab_episode_numbers = new_absolute_numbers

            if new_season_numbers and new_episode_numbers:
                bestResult.episode_numbers = new_episode_numbers
                bestResult.season_number = new_season_numbers[0]

            if bestResult.show.is_scene:
                logger.log(
                    u"Converted parsed result " + bestResult.original_name + " into " + str(bestResult).decode('utf-8',
                                                                                                               'xmlcharrefreplace'),
                    logger.DEBUG)

        # CPU sleep
        time.sleep(0.02)

        return bestResult
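
The re.sub here strips every non-digit from a candidate air date and rejects the sentinel '112263', presumably so the show title "11.22.63" is not misread as an air-by-date episode. A minimal sketch of that guard:

import re

air_date = "11.22.63"
digits = re.sub(r'[^\d]*', '', air_date)
print(digits)              # 112263
print(digits != '112263')  # False, so this candidate would be skipped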

Example 85

Project: txtorg Source File: nielsenstemmer.py
Function: remove_stop_words
def removeStopWords (texts):

    # Split up the words...
    texts_split = texts.split(" ")

    preps = [
        u'\u0641\u064a',  #fy
        u'\u0641\u064a\u0647',  #fyh
        u'\u0641\u064a\u0647\u0627',  #fyha
        u'\u0641\u064a\u0647\u0645',  #fyhm
        u'\u0639\u0644\u0649',  #3lA
        u'\u0639\u0644\u064a\u0643',  #3lyk
        u'\u0639\u0644\u064a\u0647',  #3lyh
        u'\u0639\u0644\u064a\u0647\u0627',  #3lyha
        u'\u0639\u0644\u064a\u0647\u0645',  #3lyhm
        u'\u0639\u0644\u064a',  #3ly
        u'\u0628\u0647',  #bh
        u'\u0628\u0647\u0627',  #bha
        u'\u0628\u0647\u0645',  #bhm
        u'\u0644\u0647',  #lh
        u'\u0644\u0647\u0627',  #lha
        u'\u0644\u0647\u0645',  #lhm
        u'\u0645\u0639',  #m3
        u'\u0645\u0639\u0647',  #m3h
        u'\u0645\u0639\u0647\u0627',  #m3ha
        u'\u0645\u0639\u0647\u0645',  #m3hm
        u'\u0639\u0646',  #3n
        u'\u0639\u0646\u0647',  #3nh
        u'\u0639\u0646\u0647\u0627',  #3nha
        u'\u0639\u0646\u0647\u0645',  #3nhm
        u'\u062a\u062d\u062a',  #t7t
        u'\u062d\u062a\u0649',  #7tA
        u'\u0641\u0648\u0642',  #fwQ
        u'\u0641\u0648\u0642\u064e',  #fwQ?
        u'\u0628\u062c\u0627\u0646\u0628',  #bjanb
        u'\u0623\u0645\u0627\u0645',  #amam
        u'\u0623\u0645\u0627\u0645\u064e',  #amam?
        u'\u0627\u0645\u0627\u0645',  #amam
        u'\u062e\u0627\u0631\u062c',  #Karj
        u'\u0628\u0627\u0644\u062e\u0627\u0631\u062c',  #balKarj
        u'\u062d\u0648\u0644\u064e',  #7wl?
        u'\u062d\u0648\u0644',  #7wl
        u'\u0631\u063a\u0645',  #rGm
        u'\u0628\u0627\u0644\u0631\u063a\u0645',  #balrGm
        u'\u0631\u063a\u0645\u064e',  #rGm?
        u'\u0645\u0646\u0630',  #mni
        u'\u0645\u0646\u0630\u064f',  #mni?
        u'\u0645\u0646',  #mn
        u'\u062e\u0644\u0627\u0644',  #Klal
        u'\u062e\u0644\u0627\u0644\u064e',  #Klal?
        u'\u062d\u0648\u0644',  #7wl
        u'\u062d\u0648\u0644\u064e',  #7wl?
        u'\u0642\u0628\u0644',  #Qbl
        u'\u0642\u0628\u0644\u064e',  #Qbl?
        u'\u0648\u0641\u0642\u0627',  #wfQa
        u'\u0625\u0644\u0649',  #alA
        u'\u0627\u0644\u0649\u0648\u0631\u0627\u0621\u064e',  #alAwraq?
        u'\u0648\u0631\u0627\u0621',  #wraq
        u'\u0628\u064a\u0646\u064e',  #byn?
        u'\u0628\u064a\u0646',  #byn
        u'\u0628\u062f\u0648\u0646',  #bdwn
        u'\u0644\u0643\u0646',  #lkn
        u'\u0628\u0627\u062a\u062c\u0627\u0647',  #batjah
        u'\u0623\u0642\u0644',  #aQl
        u'\u0627\u0642\u0644',  #aQl
        u'\u0627\u0643\u062b\u0631'  #akUr
        ]  

    pronouns = [
        u'\u0647\u0630\u0627',  #hia
        u'\u0647\u0630\u0647',  #hih
        u'\u0630\u0644\u0643',  #ilk
        u'\u062a\u0644\u0643',  #tlk
        u'\u0647\u0624\u0644\u064e\u0627\u0621',  #hol?aq
        u'\u0647\u0624\u0644\u0627\u0621',  #holaq
        u'\u0627\u0648\u0644\u0627\u0626\u0643',  #awla5k
        u'\u0647\u0630\u0627\u0646',  #hian
        u'\u0647\u0630\u064a\u0646\u0647\u062a\u0627\u0646',  #hiynhtan
        u'\u0647\u062a\u064a\u0646\u0623\u0646\u0627',  #htynana
        u'\u0627\u0646\u0627',  #ana
        u'\u0623\u0646\u062a',  #ant
        u'\u0647\u0645\u0627',  #hma
        u'\u0623\u0646\u062a\u064e',  #ant?
        u'\u0627\u0646\u062a',  #ant
        u'\u0623\u0646\u062a',  #ant
        u'\u0623\u0646\u062a\u0650',  #ant?
        u'\u0627\u0646\u062a\u0647\u0648',  #anthw
        u'\u0647\u0648\u064e',  #hw?
        u'\u0647\u0648',  #hw
        u'\u0647\u064a',  #hy
        u'\u0647\u064a\u064e',  #hy?
        u'\u0646\u062d\u0646',  #n7n
        u'\u0623\u0646\u062a\u0645',  #antm
        u'\u0627\u0646\u062a\u0645',  #antm
        u'\u0623\u0646\u062a\u0645',  #antm
        u'\u0627\u0646\u062a\u0645',  #antm
        u'\u0647\u064f\u0645',  #h?m
        u'\u0647\u0645',  #hm
        u'\u0644\u0647\u0645',  #lhm
        u'\u0645\u0646\u0647\u0645',  #mnhm
        u'\u0648\u0647\u0645',  #whm
        u'\u0627\u0644\u062a\u064a',  #alty
        u'\u0627\u0644\u0630\u064a',  #aliy
        u'\u0627\u0644\u0644\u0630\u0627\u0646',  #allian
        u'\u0627\u0644\u0644\u0630\u064a\u0646',  #alliyn
        u'\u0627\u0644\u0644\u062a\u0627\u0646',  #alltan
        u'\u0627\u0644\u0644\u062a\u064a\u0646'  #alltyn
        ]

    particles = [
        u'\u0627\u0646',  #an
        u'\u0648\u0627\u0646',  #wan
        u'\u0625\u0646',  #an
        u'\u0625\u0646\u0647',  #anh
        u'\u0625\u0646\u0647\u0627',  #anha
        u'\u0625\u0646\u0647\u0645',  #anhm
        u'\u0625\u0646\u0647\u0645\u0627',  #anhma
        u'\u0625\u0646\u064a',  #any
        u'\u0648\u0625\u0646',  #wan
        u'\u0648\u0623\u0646',  #wan
        u'\u0627\u0646',  #an
        u'\u0627\u0646\u0647',  #anh
        u'\u0627\u0646\u0647\u0627',  #anha
        u'\u0627\u0646\u0647\u0645',  #anhm
        u'\u0627\u0646\u0647\u0645\u0627',  #anhma
        u'\u0627\u0646\u064a',  #any
        u'\u0648\u0627\u0646',  #wan
        u'\u0648\u0627\u0646',  #wan
        u'\u0623\u0646',  #an
        u'\u0627\u0646',  #an
        u'\u0623\u0644\u0627',  #ala
        u'\u0628\u0623\u0646',  #ban
        u'\u0627\u0646',  #an
        u'\u0627\u0644\u0627',  #ala
        u'\u0628\u0627\u0646',  #ban
        u'\u0623\u0646\u0647',  #anh
        u'\u0623\u0646\u0647\u0627',  #anha
        u'\u0623\u0646\u0647\u0645',  #anhm
        u'\u0623\u0646\u0647\u0645\u0627',  #anhma
        u'\u0627\u0646\u0647',  #anh
        u'\u0627\u0646\u0647\u0627',  #anha
        u'\u0627\u0646\u0647\u0645',  #anhm
        u'\u0627\u0646\u0647\u0645\u0627',  #anhma
        u'\u0623\u0630',  #ai
        u'\u0627\u0630',  #ai
        u'\u0627\u0630\u0627',  #aia
        u'\u0625\u0630',  #ai
        u'\u0625\u0630\u0627',  #aia
        u'\u0648\u0625\u0630',  #wai
        u'\u0648\u0625\u0630\u0627',  #waia
        u'\u0627\u0630',  #ai
        u'\u0627\u0630',  #ai
        u'\u0627\u0630\u0627',  #aia
        u'\u0627\u0630',  #ai
        u'\u0627\u0630\u0627',  #aia
        u'\u0641\u0627\u0630\u0627',  #faia
        u'\u0645\u0627\u0630\u0627',  #maia
        u'\u0648\u0627\u0630',  #wai
        u'\u0648\u0627\u0630\u0627',  #waia
        u'\u0644\u0648\u0644\u0627',  #lwla
        u'\u0644\u0648',  #lw
        u'\u0648\u0644\u0648\u0633\u0648\u0641',  #wlwswf
        u'\u0644\u0646',  #ln
        u'\u0645\u0627',  #ma
        u'\u0644\u0645',  #lm
        u'\u0648\u0644\u0645',  #wlm
        u'\u0623\u0645\u0627',  #ama
        u'\u0627\u0645\u0627',  #ama
        u'\u0644\u0627',  #la
        u'\u0625\u0644\u0627',  #ala
        u'\u0627\u0644\u0627',  #ala
        u'\u0623\u0645',  #am
        u'\u0623\u0648',  #aw
        u'\u0627\u0645',  #am
        u'\u0627\u0648',  #aw
        u'\u0628\u0644',  #bl
        u'\u0623\u0646\u0645\u0627',  #anma
        u'\u0625\u0646\u0645\u0627',  #anma
        u'\u0628\u0644',  #bl
        u'\u0627\u0646\u0645\u0627',  #anma
        u'\u0627\u0646\u0645\u0627',  #anma
        u'\u0648'  #w
        ]

    # Connectors
    connectors = [u'\u0628\u0645\u0627',  #bma
        u'\u0643\u0645\u0627',  #kma
        u'\u0644\u0645\u0627',  #lma
        u'\u0644\u0623\u0646',  #lan
        u'\u0644\u064a', #ly
        u'\u0644\u0649', #ly
        u'\u0644\u0623\u0646\u0647',  #lanh
        u'\u0644\u0623\u0646\u0647\u0627',  #lanha
        u'\u0644\u0623\u0646\u0647\u0645',  #lanhm
        u'\u0644\u0627\u0646',  #lan
        u'\u0644\u0627\u0646\u0647',  #lanh
        u'\u0644\u0627\u0646\u0647\u0627',  #lanha
        u'\u0644\u0627\u0646\u0647\u0645',  #lanhm
        u'\u062b\u0645',  #Um
        u'\u0623\u064a\u0636\u0627',  #ayDa
        u'\u0627\u064a\u0636\u0627',  #ayDa
        u'\u0643\u0630\u0644\u0643',  #kilk
        u'\u0642\u0628\u0644',  #Qbl
        u'\u0628\u0639\u062f',  #b3d
        u'\u0644\u0643\u0646',  #lkn
        u'\u0648\u0644\u0643\u0646',  #wlkn
        u'\u0644\u0643\u0646\u0647',  #lknh
        u'\u0644\u0643\u0646\u0647\u0627',  #lknha
        u'\u0644\u0643\u0646\u0647\u0645',  #lknhm
        u'\u0641\u0642\u0637',  #fQT
        u'\u0631\u063a\u0645',  #rGm
        u'\u0628\u0627\u0644\u0631\u063a\u0645',  #balrGm
        u'\u0628\u0641\u0636\u0644',  #bfDl
        u'\u062d\u064a\u062b',  #7yU
        u'\u0628\u062d\u064a\u062b',  #b7yU
        u'\u0644\u0643\u064a',  #lky
        u'\u0647\u0646\u0627',  #hna
        u'\u0647\u0646\u0627\u0643',  #hnak
        u'\u0628\u0633\u0628\u0628',  #bsbb
        u'\u0630\u0627\u062a',  #iat
        u'\u0630\u0648',  #iw
        u'\u0630\u064a',  #iy
        u'\u0630\u0649',  #iy
        u'\u0648\u0647', #wh
        u'\u064a\u0627',  #ya
        u'\u0627\u0646\u0645\u0627',  #anma
        u'\u0641\u0647\u0630\u0627',  #fhia
        u'\u0641\u0647\u0648',  #fhw
        u'\u0641\u0645\u0627',  #fma
        u'\u0641\u0645\u0646',  #fmn
        u'\u0641\u064a\u0645\u0627', #fyma
        u'\u0641\u0647\u0644',  #fhl
        u'\u0648\u0647\u0644',  #fhl
        u'\u0641\u0647\u0624\u0644\u0627\u0621',  #fholaq
        u'\u0643\u0630\u0627', #kia
        u'\u0644\u0630\u0644\u0643', #lilk
        u'\u0644\u0645\u0627\u0630\u0627', #lmaia
        u'\u0644\u0645\u0646', #lmn
        u'\u0644\u0646\u0627',  #lna
        u'\u0645\u0646\u0627',  #mna
        u'\u0645\u0646\u0643',  #mnk
        u'\u0645\u0646\u0643\u0645',  #mnkm
        u'\u0645\u0646\u0647\u0645\u0627',  #mnhma
        u'\u0644\u0643', #lk
        u'\u0648\u0644\u0648', #wlw
        u'\u0645\u0645\u0627', #mma
        u'\u0639\u0646\u062f',  #3nd
        u'\u0639\u0646\u062f\u0647\u0645',  #3ndhm
        u'\u0639\u0646\u062f\u0645\u0627',  #3ndma
        u'\u0639\u0646\u062f\u0646\u0627',  #3ndna
        u'\u0639\u0646\u0647\u0645\u0627',  #3nhma
        u'\u0639\u0646\u0643',  #3nk
        u'\u0627\u0630\u0646',  #ain
        u'\u0627\u0644\u0630\u064a',  #aliy
        u'\u0641\u0627\u0646\u0627',  #fana
        u'\u0641\u0627\u0646\u0647\u0645',  #fanhm
        u'\u0641\u0647\u0645',  #fhm
        u'\u0641\u0647',  #fh
        u'\u0641\u0643\u0644',  #fkl
        u'\u0644\u0643\u0644',  #lkl
        u'\u0644\u0643\u0645',  #lkm
        u'\u0641\u0644\u0645',  #flm
        u'\u0641\u0644\u0645\u0627',  #flma
        u'\u0641\u064a\u0643',  #fyk
        u'\u0641\u064a\u0643\u0645',  #fykm
        u'\u0644\u0647\u0630\u0627'    # lhia
        ]

    all = preps+pronouns+particles+connectors
    all = ' '.join(all)
    all = all+ ' ' + fixAlifs(all)
    all = list(set(all.split(' ')))
    
    for i in range(len(texts_split)):
        word = texts_split[i] 
        if word in all :
            texts_split[i] = ''

    # Rejoining the texts again...
    texts = ''.join([word + " "  for word in texts_split])
    # split to get rid of white space
    #texts_split = new_texts.split()
    # then re-rejoin them.
    #out_texts = ''.join([word + " "  for word in texts_split])
    ## clean up spaces
    return(re.sub(' {2,}', ' ', texts))
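
Blanking out stopwords leaves runs of spaces behind, so the function ends by collapsing two or more spaces back into one. A minimal sketch of that final cleanup:

import re

texts = "word1   word2  word3 "
print(re.sub(' {2,}', ' ', texts))  # word1 word2 word3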

Example 86

Project: termite-data-server Source File: template.py
Function: parse
    def parse(self, text):

        # Basically, r_tag.split will split the text into
        # an array containing, 'non-tag', 'tag', 'non-tag', 'tag'
        # so if we alternate this variable, we know
        # what to look for. This is alternate to
        # line.startswith("{{")
        in_tag = False
        extend = None
        pre_extend = True

        # Use a list to store everything in
        # This is because later the code will "look ahead"
        # for missing strings or brackets.
        ij = self.r_tag.split(text)
        # j = current index
        # i = current item
        stack = self.stack
        for j in range(len(ij)):
            i = ij[j]

            if i:
                if not stack:
                    self._raise_error('The "end" tag is unmatched, please check if you have a starting "block" tag')

                # Our current element in the stack.
                top = stack[-1]

                if in_tag:
                    line = i

                    # Get rid of delimiters
                    line = line[len(self.delimiters[0]):-len(self.delimiters[1])].strip()

                    # This is bad juju, but let's do it anyway
                    if not line:
                        continue

                    # We do not want to replace the newlines in code,
                    # only in block comments.
                    def remove_newline(re_val):
                        # Take the entire match and replace newlines with
                        # escaped newlines.
                        return re_val.group(0).replace('\n', '\\n')

                    # Perform block comment escaping.
                    # This performs escaping ON anything
                    # in between """ and """
                    line = sub(TemplateParser.r_multiline,
                               remove_newline,
                               line)

                    if line.startswith('='):
                        # IE: {{=response.title}}
                        name, value = '=', line[1:].strip()
                    else:
                        v = line.split(' ', 1)
                        if len(v) == 1:
                            # Example
                            # {{ include }}
                            # {{ end }}
                            name = v[0]
                            value = ''
                        else:
                            # Example
                            # {{ block pie }}
                            # {{ include "layout.html" }}
                            # {{ for i in range(10): }}
                            name = v[0]
                            value = v[1]

                    # This will replace newlines in block comments
                    # with the newline character. This is so that they
                    # retain their formatting, but squish down to one
                    # line in the rendered template.

                    # First check if we have any custom lexers
                    if name in self.lexers:
                        # Pass the information to the lexer
                        # and allow it to inject in the environment

                        # You can define custom names such as
                        # '{{<<variable}}' which could potentially
                        # write unescaped version of the variable.
                        self.lexers[name](parser=self,
                                          value=value,
                                          top=top,
                                          stack=stack)

                    elif name == '=':
                        # So we have a variable to insert into
                        # the template
                        buf = "\n%s(%s)" % (self.writer, value)
                        top.append(Node(buf, pre_extend=pre_extend))

                    elif name == 'block' and not value.startswith('='):
                        # Make a new node with name.
                        node = BlockNode(name=value.strip(),
                                         pre_extend=pre_extend,
                                         delimiters=self.delimiters)

                        # Append this node to our active node
                        top.append(node)

                        # Make sure to add the node to the stack.
                        # so anything after this gets added
                        # to this node. This allows us to
                        # "nest" nodes.
                        stack.append(node)

                    elif name == 'end' and not value.startswith('='):
                        # We are done with this node.

                        # Save an instance of it
                        self.blocks[top.name] = top

                        # Pop it.
                        stack.pop()

                    elif name == 'super' and not value.startswith('='):
                        # Get our correct target name
                        # If they just called {{super}} without a name
                        # attempt to assume the top blocks name.
                        if value:
                            target_node = value
                        else:
                            target_node = top.name

                        # Create a SuperNode instance
                        node = SuperNode(name=target_node,
                                         pre_extend=pre_extend)

                        # Add this to our list to be taken care of
                        self.super_nodes.append(node)

                        # And put in in the tree
                        top.append(node)

                    elif name == 'include' and not value.startswith('='):
                        # If we know the target file to include
                        if value:
                            self.include(top, value)

                        # Otherwise, make a temporary include node
                        # That the child node will know to hook into.
                        else:
                            include_node = BlockNode(
                                name='__include__' + self.name,
                                pre_extend=pre_extend,
                                delimiters=self.delimiters)
                            top.append(include_node)

                    elif name == 'extend' and not value.startswith('='):
                        # We need to extend the following
                        # template.
                        extend = value
                        pre_extend = False

                    else:
                        # If we don't know where it belongs
                        # we just add it anyways without formatting.
                        if line and in_tag:

                            # Split on the newlines >.<
                            tokens = line.split('\n')

                            # We need to look for any instances of
                            # for i in range(10):
                            #   = i
                            # pass
                            # So we can properly put a response.write() in place.
                            continuation = False
                            len_parsed = 0
                            for k, token in enumerate(tokens):

                                token = tokens[k] = token.strip()
                                len_parsed += len(token)

                                if token.startswith('='):
                                    if token.endswith('\\'):
                                        continuation = True
                                        tokens[k] = "\n%s(%s" % (
                                            self.writer, token[1:].strip())
                                    else:
                                        tokens[k] = "\n%s(%s)" % (
                                            self.writer, token[1:].strip())
                                elif continuation:
                                    tokens[k] += ')'
                                    continuation = False

                            buf = "\n%s" % '\n'.join(tokens)
                            top.append(Node(buf, pre_extend=pre_extend))

                else:
                    # It is HTML so just include it.
                    buf = "\n%s(%r, escape=False)" % (self.writer, i)
                    top.append(Node(buf, pre_extend=pre_extend))

            # Remember: tag, not tag, tag, not tag
            in_tag = not in_tag

        # Make a list of items to remove from child
        to_rm = []

        # Go through each of the children nodes
        for node in self.child_super_nodes:
            # If we declared a block that this node wants to include
            if node.name in self.blocks:
                # Go ahead and include it!
                node.value = self.blocks[node.name]
                # Since we processed this child, we don't need to
                # pass it along to the parent
                to_rm.append(node)

        # Remove some of the processed nodes
        for node in to_rm:
            # Since this is a pointer, it works beautifully.
            # Sometimes I miss C-Style pointers... I want my asterisk...
            self.child_super_nodes.remove(node)

        # If we need to extend a template.
        if extend:
            self.extend(extend)
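
The sub() call in this example (presumably re.sub imported by name) shows that the replacement argument can be a callable: it is handed each match object and returns the replacement text, here escaping newlines inside triple-quoted blocks. A minimal sketch with a simplified stand-in for TemplateParser.r_multiline:

import re

# Simplified stand-in; the real r_multiline presumably also handles '''...''' blocks.
r_multiline = re.compile(r'""".*?"""', re.DOTALL)

def remove_newline(re_val):
    # Replace literal newlines inside the matched block with escaped ones.
    return re_val.group(0).replace('\n', '\\n')

line = 'x = """first\nsecond"""'
print(re.sub(r_multiline, remove_newline, line))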

Example 87

Project: hue Source File: create_table.py
def import_wizard(request, database='default'):
  """
  Help users define table and based on a file they want to import to Hive.
  Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
  """
  encoding = i18n.get_site_encoding()
  app_name = get_app_name(request)

  db = dbms.get(request.user)
  dbs = db.get_databases()
  databases = [{'name':db, 'url':reverse('beeswax:import_wizard', kwargs={'database': db})} for db in dbs]

  if request.method == 'POST':
    #
    # General processing logic:
    # - We have 3 steps. Each requires the previous.
    #   * Step 1      : Table name and file location
    #   * Step 2a     : Display sample with auto chosen delim
    #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
    #   * Step 3      : Display sample, and define columns
    # - Each step is represented by a different form. The form of an earlier step
    #   should be present when submitting to a later step.
    # - To preserve the data from the earlier steps, we send the forms back as
    #   hidden fields. This way, when users revisit a previous step, the data would
    #   be there as well.
    #
    delim_is_auto = False
    fields_list, n_cols = [[]], 0
    s3_col_formset = None
    s1_file_form = CreateByImportFileForm(request.POST, db=db)

    if s1_file_form.is_valid():
      do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
      do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
      do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
      do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute

      cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
      cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

      # Exactly one of these should be True
      if len(filter(None, (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def, do_hive_create, cancel_s2_user_delim, cancel_s3_column_def))) != 1:
        raise PopupException(_('Invalid form submission'))

      if not do_s2_auto_delim:
        # We should have a valid delim form
        s2_delim_form = CreateByImportDelimForm(request.POST)
        if not s2_delim_form.is_valid():
          # Go back to picking delimiter
          do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
      if do_hive_create:
        # We should have a valid columns formset
        s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
        if not s3_col_formset.is_valid():
          # Go back to define columns
          do_s3_column_def, do_hive_create = True, False

      load_data = s1_file_form.cleaned_data.get('load_data', 'IMPORT').upper()
      path = s1_file_form.cleaned_data['path']

      #
      # Go to step 2: We've just picked the file. Preview it.
      #
      if do_s2_auto_delim:
        try:
          if load_data == 'IMPORT':
            if not request.fs.isfile(path):
              raise PopupException(_('Path location must refer to a file if "Import Data" is selected.'))
          elif load_data == 'EXTERNAL':
            if not request.fs.isdir(path):
              raise PopupException(_('Path location must refer to a directory if "Create External Table" is selected.'))
        except (IOError, S3FileSystemException), e:
          raise PopupException(_('Path location "%s" is invalid: %s') % (path, e))

        delim_is_auto = True
        fields_list, n_cols, s2_delim_form = _delim_preview(request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS)

      if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
        # Delimit based on input
        fields_list, n_cols, s2_delim_form = _delim_preview(request.fs, s1_file_form, encoding, (s2_delim_form.cleaned_data['file_type'],),
                                                            (s2_delim_form.cleaned_data['delimiter'],))

      if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
        return render('import_wizard_choose_delimiter.mako', request, {
          'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
          'delim_readable': DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
          'initial': delim_is_auto,
          'file_form': s1_file_form,
          'delim_form': s2_delim_form,
          'fields_list': fields_list,
          'delimiter_choices': TERMINATOR_CHOICES,
          'n_cols': n_cols,
          'database': database,
          'databases': databases
        })

      #
      # Go to step 3: Define columns.
      #
      if do_s3_column_def:
        if s3_col_formset is None:
          columns = []
          for i in range(n_cols):
            columns.append({
                'column_name': 'col_%s' % (i,),
                'column_type': 'string',
            })
          s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
        try:
          fields_list_for_json = list(fields_list)
          if fields_list_for_json:
            fields_list_for_json[0] = map(lambda a: re.sub('[^\w]', '', a), fields_list_for_json[0]) # Cleaning headers

          return render('import_wizard_define_columns.mako', request, {
            'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
            'file_form': s1_file_form,
            'delim_form': s2_delim_form,
            'column_formset': s3_col_formset,
            'fields_list': fields_list,
            'fields_list_json': json.dumps(fields_list_for_json),
            'n_cols': n_cols,
            'database': database,
            'databases': databases
          })
        except Exception, e:
          raise PopupException(_("The selected delimiter is creating an uneven number of columns. Please make sure you don't have empty columns."), detail=e)

      #
      # Final: Execute
      #
      if do_hive_create:
        delim = s2_delim_form.cleaned_data['delimiter']
        table_name = s1_file_form.cleaned_data['name']

        proposed_query = django_mako.render_to_string("create_table_statement.mako", {
            'table': {
                'name': table_name,
                'comment': s1_file_form.cleaned_data['comment'],
                'row_format': 'Delimited',
                'field_terminator': delim,
                'file_format': 'TextFile',
                'load_data': load_data,
                'path': path,
                'skip_header': request.REQUEST.get('removeHeader', 'off').lower() == 'on'
             },
            'columns': [ f.cleaned_data for f in s3_col_formset.forms ],
            'partition_columns': [],
            'database': database,
            'databases': databases
          }
        )
        try:
          return _submit_create_and_load(request, proposed_query, table_name, path, load_data, database=database)
        except QueryServerException, e:
          raise PopupException(_('The table could not be created.'), detail=e.message)
  else:
    s1_file_form = CreateByImportFileForm()

  return render('import_wizard_choose_file.mako', request, {
    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
    'file_form': s1_file_form,
    'database': database,
    'databases': databases
  })

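After the columns are defaulted, the first sampled row is cleaned with re.sub('[^\w]', '', a) so the headers can be used as identifiers in the JSON handed to the template. A small self-contained sketch of that cleanup; the sample headers are made up.

import re

# Strip everything except letters, digits, and underscores from headers,
# mirroring the header cleanup in the wizard above. Sample data is invented.
headers = ['customer id', 'order-date', 'total ($)']
cleaned = [re.sub(r'[^\w]', '', h) for h in headers]
print(cleaned)  # ['customerid', 'orderdate', 'total']
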
Example 88

Project: deep_recommend_system Source File: parsing_ops.py
def _parse_single_sequence_example_raw(serialized,
                                       context_sparse_keys=None,
                                       context_sparse_types=None,
                                       context_dense_keys=None,
                                       context_dense_types=None,
                                       context_dense_defaults=None,
                                       context_dense_shapes=None,
                                       feature_list_sparse_keys=None,
                                       feature_list_sparse_types=None,
                                       feature_list_dense_keys=None,
                                       feature_list_dense_types=None,
                                       feature_list_dense_shapes=None,
                                       feature_list_dense_defaults=None,
                                       debug_name=None,
                                       name=None):
  """Parses a single `SequenceExample` proto.

  Args:
    serialized: A scalar (0-D Tensor) of type string, a single binary
      serialized `SequenceExample` proto.
    context_sparse_keys: A list of string keys in the `SequenceExample`'s
      features.  The results for these keys will be returned as
      `SparseTensor` objects.
    context_sparse_types: A list of `DTypes`, the same length as
      `context_sparse_keys`.  Only `tf.float32` (`FloatList`),
      `tf.int64` (`Int64List`), and `tf.string` (`BytesList`) are supported.
    context_dense_keys: A list of string keys in the examples' features.
      The results for these keys will be returned as `Tensor`s
    context_dense_types: A list of DTypes, same length as `context_dense_keys`.
      Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
      and `tf.string` (`BytesList`) are supported.
    context_dense_defaults: A dict mapping string keys to `Tensor`s.
      The keys of the dict must match the context_dense_keys of the feature.
    context_dense_shapes: A list of tuples, same length as `context_dense_keys`.
      The shape of the data for each context_dense feature referenced by
      `context_dense_keys`.  Required for any input tensors identified by
      `context_dense_keys` whose shapes are anything other than `[]` or `[1]`.
    feature_list_sparse_keys: A list of string keys in the `SequenceExample`'s
      feature_lists.  The results for these keys will be returned as
      `SparseTensor` objects.
    feature_list_sparse_types: A list of `DTypes`, same length as
      `feature_list_sparse_keys`.  Only `tf.float32` (`FloatList`),
      `tf.int64` (`Int64List`), and `tf.string` (`BytesList`) are supported.
    feature_list_dense_keys: A list of string keys in the `SequenceExample`'s
      feature_lists. The results for these keys will be returned as `Tensor`s.
    feature_list_dense_types: A list of `DTypes`, same length as
      `feature_list_dense_keys`.  Only `tf.float32` (`FloatList`),
      `tf.int64` (`Int64List`), and `tf.string` (`BytesList`) are supported.
    feature_list_dense_shapes: A list of tuples, same length as
      `feature_list_dense_keys`.  The shape of the data for each
      `FeatureList` feature referenced by `feature_list_dense_keys`.
    feature_list_dense_defaults: A dict mapping key strings to values.
      The only currently allowed value is `None`.  Any key appearing
      in this dict with value `None` is allowed to be missing from the
      `SequenceExample`.  If missing, the key is treated as zero-length.
    debug_name: A scalar (0-D Tensor) of strings (optional), the name of
      the serialized proto.
    name: A name for this operation (optional).

  Returns:
    A tuple of two `dict`s, each mapping keys to `Tensor`s and `SparseTensor`s.
    The first dict contains the context key/values.
    The second dict contains the feature_list key/values.

  Raises:
    ValueError: If context_sparse and context_dense key sets intersect,
      if input lengths do not match up, or if a value in
      feature_list_dense_defaults is not None.
    TypeError: if feature_list_dense_defaults is not either None or a dict.
  """
  with ops.name_scope(name, "ParseSingleSequenceExample", [serialized]):
    context_dense_defaults = (
        {} if context_dense_defaults is None else context_dense_defaults)
    context_sparse_keys = (
        [] if context_sparse_keys is None else context_sparse_keys)
    context_sparse_types = (
        [] if context_sparse_types is None else context_sparse_types)
    context_dense_keys = (
        [] if context_dense_keys is None else context_dense_keys)
    context_dense_types = (
        [] if context_dense_types is None else context_dense_types)
    context_dense_shapes = (
        [[]] * len(context_dense_keys)
        if context_dense_shapes is None else context_dense_shapes)
    feature_list_sparse_keys = (
        [] if feature_list_sparse_keys is None else feature_list_sparse_keys)
    feature_list_sparse_types = (
        [] if feature_list_sparse_types is None else feature_list_sparse_types)
    feature_list_dense_keys = (
        [] if feature_list_dense_keys is None else feature_list_dense_keys)
    feature_list_dense_types = (
        [] if feature_list_dense_types is None else feature_list_dense_types)
    feature_list_dense_shapes = (
        [[]] * len(feature_list_dense_keys)
        if feature_list_dense_shapes is None else feature_list_dense_shapes)
    feature_list_dense_defaults = (
        dict() if feature_list_dense_defaults is None
        else feature_list_dense_defaults)
    debug_name = "" if debug_name is None else debug_name

    # Internal
    feature_list_dense_missing_assumed_empty = []

    num_context_dense = len(context_dense_keys)
    num_feature_list_dense = len(feature_list_dense_keys)
    num_context_sparse = len(context_sparse_keys)
    num_feature_list_sparse = len(feature_list_sparse_keys)

    if len(context_dense_shapes) != num_context_dense:
      raise ValueError(
          "len(context_dense_shapes) != len(context_dense_keys): %d vs. %d"
          % (len(context_dense_shapes), num_context_dense))
    if len(context_dense_types) != num_context_dense:
      raise ValueError(
          "len(context_dense_types) != len(context_dense_keys): %d vs. %d"
          % (len(context_dense_types), num_context_dense))
    if len(feature_list_dense_shapes) != num_feature_list_dense:
      raise ValueError(
          "len(feature_list_dense_shapes) != len(feature_list_dense_keys): "
          "%d vs. %d" % (len(feature_list_dense_shapes),
                         num_feature_list_dense))
    if len(feature_list_dense_types) != num_feature_list_dense:
      raise ValueError(
          "len(feature_list_dense_types) != len(feature_list_dense_keys): "
          "%d vs. %d" % (len(feature_list_dense_types), num_feature_list_dense))
    if len(context_sparse_types) != num_context_sparse:
      raise ValueError(
          "len(context_sparse_types) != len(context_sparse_keys): %d vs. %d"
          % (len(context_sparse_types), num_context_sparse))
    if len(feature_list_sparse_types) != num_feature_list_sparse:
      raise ValueError(
          "len(feature_list_sparse_types) != len(feature_list_sparse_keys): "
          "%d vs. %d"
          % (len(feature_list_sparse_types), num_feature_list_sparse))
    if (num_context_dense + num_context_sparse
        + num_feature_list_dense + num_feature_list_sparse) == 0:
      raise ValueError(
          "Must provide at least one context_sparse key, context_dense key, "
          "feature_list_sparse key, or feature_list_dense key")
    if not set(context_dense_keys).isdisjoint(set(context_sparse_keys)):
      raise ValueError(
          "context_dense and context_sparse keys must not intersect; "
          "intersection: %s" %
          set(context_dense_keys).intersection(set(context_sparse_keys)))
    if not set(feature_list_dense_keys).isdisjoint(
        set(feature_list_sparse_keys)):
      raise ValueError(
          "feature_list_dense and feature_list_sparse keys must not intersect; "
          "intersection: %s" %
          set(feature_list_dense_keys).intersection(
              set(feature_list_sparse_keys)))
    if not isinstance(feature_list_dense_defaults, dict):
      raise TypeError("feature_list_dense_defaults must be a dict")
    for k, v in feature_list_dense_defaults.items():
      if v is not None:
        raise ValueError("Value feature_list_dense_defaults[%s] must be None"
                         % k)
      feature_list_dense_missing_assumed_empty.append(k)

    context_dense_defaults_vec = []
    for i, key in enumerate(context_dense_keys):
      default_value = context_dense_defaults.get(key)
      if default_value is None:
        default_value = constant_op.constant([], dtype=context_dense_types[i])
      elif not isinstance(default_value, ops.Tensor):
        key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
        default_value = ops.convert_to_tensor(
            default_value, dtype=context_dense_types[i], name=key_name)
        default_value = array_ops.reshape(
            default_value, context_dense_shapes[i])

      context_dense_defaults_vec.append(default_value)

    context_dense_shapes = [tensor_shape.as_shape(shape).as_proto()
                            for shape in context_dense_shapes]
    feature_list_dense_shapes = [tensor_shape.as_shape(shape).as_proto()
                                 for shape in feature_list_dense_shapes]

    # pylint: disable=protected-access
    outputs = gen_parsing_ops._parse_single_sequence_example(
        serialized=serialized,
        debug_name=debug_name,
        context_dense_defaults=context_dense_defaults_vec,
        context_sparse_keys=context_sparse_keys,
        context_sparse_types=context_sparse_types,
        context_dense_keys=context_dense_keys,
        context_dense_shapes=context_dense_shapes,
        feature_list_sparse_keys=feature_list_sparse_keys,
        feature_list_sparse_types=feature_list_sparse_types,
        feature_list_dense_keys=feature_list_dense_keys,
        feature_list_dense_types=feature_list_dense_types,
        feature_list_dense_shapes=feature_list_dense_shapes,
        feature_list_dense_missing_assumed_empty=(
            feature_list_dense_missing_assumed_empty),
        name=name)
    # pylint: enable=protected-access

    (context_sparse_indices, context_sparse_values,
     context_sparse_shapes, context_dense_values,
     feature_list_sparse_indices, feature_list_sparse_values,
     feature_list_sparse_shapes, feature_list_dense_values) = outputs

    context_sparse_tensors = [
        sparse_tensor.SparseTensor(ix, val, shape) for (ix, val, shape)
        in zip(context_sparse_indices,
               context_sparse_values,
               context_sparse_shapes)]

    feature_list_sparse_tensors = [
        sparse_tensor.SparseTensor(ix, val, shape) for (ix, val, shape)
        in zip(feature_list_sparse_indices,
               feature_list_sparse_values,
               feature_list_sparse_shapes)]

    context_output = dict(
        zip(context_sparse_keys + context_dense_keys,
            context_sparse_tensors + context_dense_values))
    feature_list_output = dict(
        zip(feature_list_sparse_keys + feature_list_dense_keys,
            feature_list_sparse_tensors + feature_list_dense_values))

    return (context_output, feature_list_output)

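When a non-Tensor default value is supplied, the code above derives a legal op name from the feature key by replacing every character outside [A-Za-z0-9_.\-/] with an underscore. A minimal sketch of that sanitization on its own; the sample keys are invented.

import re

# Replace characters that are not wanted in an op name with "_",
# as in the key_name construction above. Sample keys are made up.
for key in ['image/width', 'label:coarse', 'time stamp']:
    print("key_" + re.sub(r"[^A-Za-z0-9_.\-/]", "_", key))
# key_image/width
# key_label_coarse
# key_time_stamp
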
Example 89

Project: libtrack Source File: parsing.py
def yield_paths(filename, timing_info=None, filter_calls=[]):
    """Retrieves full paths from a trace file

    Given a trace file as input, yields all full paths ending
    in a POSIX call, until the end of the file. Full paths are
    returned as lists of function-call elements along with the
    respective library name.
    For example, "__libc_init:libc.so" is the signature for
    the call "__libc_init" belonging to lib "libc.so".

    Args:
        filename

    Returns:
        fullpath as a list

    Raises:
        Exception: if log file is corrupted
    """
    try:
        f = open(filename)
    except IOError, error:
        print >> sys.stderr, "I/O error while opening file: %s" % error
        return

    cache = Cache()
    for line in f:
        try:
            labels = line.split(':')[:3]
        except Exception, error:
            print >> sys.stderr, "Unhandled Exception:", error, filename
            continue
            #return
        if labels[1:] in [['BT', 'REPEAT']]:
            try:
                cache.repeat_next = int(line.split(':')[3])
            except Exception, error:
                #print >> sys.stderr, "Unhandled Exception:", error, filename
                continue
                #return

        elif labels[1:] in [['BT', 'START']]:
           # flush the bloody cache
            if cache.dirty_native_path:
                if cache.native_path[-1] in filter_calls:
                    cache.dirty_native_path = False
                    cache.dirty_dalvik_path = False
                    cache.repeat_current = cache.repeat_next
                    cache.repeat_next = 1
                    continue
                # separately retrieve fd type for open
                if cache.native_path[-1] == 'open:libc.so':
                    cache.native_path[-1] = 'open_' + cache.fd_buff + ':libc.so'
                for _ in range(cache.repeat_current):
                    #yield cache.native_path
                    if cache.dirty_dalvik_path:
                        yield cache.dalvik_path + cache.native_path 
                    else:
                        yield cache.native_path
                cache.dirty_native_path = False
                cache.dirty_dalvik_path = False
                cache.repeat_current = cache.repeat_next
                cache.repeat_next = 1
            try:
                frames = int(line.split(':')[3])
                cache.set_native(f, frames)
            except (ValueError, IndexError), error:
                #print >>  sys.stderr, "Error:", error, "in file:", filename
                continue
                #return
            except StopIteration, error:
                #print >> sys.stderr, "Error:", error, "unexpected end in File:", filename
                continue
                #return
            except Exception as error:
                #print >> sys.stderr, "Unhandled Exception:", error, "in file:", filename
                continue
                #return

        elif labels[1:] in [['DVM', 'BT_START']]:
            try:
                frames = int(line.split(':')[3])
                cache.set_dalvik(f, frames)
            except (ValueError, IndexError), error:
                #print >> sys.stderr, "Error:", error, "in file:", filename
                continue
                #return
            except StopIteration, error:
                #print >> sys.stderr, "Error:", error, "unexpected end in File:", filename
                continue
                #return
            except Exception, error:
                #print >> sys.stderr, "Unhandled Exception:", error, "in file:", filename
                continue
                #return

        elif labels[1:] in [['LOG', 'I']]:
            try:
                cache.set_fd_buff(line)
            except (ValueError, IndexError), error:
                #print >> sys.stderr, "Error:", error, "in file:", filename
                continue
                #return
            except StopIteration, error:
                #print >> sys.stderr, "Error:", error, "unexpected end in File:", filename
                continue
                #return
            except Exception, error:
                #print >> sys.stderr, "Unhandled Exception:", error, "in file:", filename
                continue
                #return

        elif labels[1:] in [['BT_REPEAT', '1']]:
            cache.dirty_dalvik_path = True

        elif labels[1:2] in [['CALL']]:
            if len(line.split('.')) > 2:
                continue
            try:
                yield ["NO_BACKTRACE:NO_BACKTRACE", labels[2].split('\n')[0] + ':libc.so']
            except Exception, error:
                #print >> sys.stderr, "Unhandled Exception:", error, "in file:", filename
                continue
                #return

        elif labels[1:] in [['LOG', 'T']]:
            if timing_info == None:
                continue
            # nothing else I can do now for this kind of corrupted log
            #if len(line.split(':')) != 5:
            #    continue
            try:
                call = line.split(':')[3] + ":libc.so"
                if call in filter_calls:
                    continue
                time = line.split(':')[4].split('\n')[0]
                nsec = 1000000000*int(time.split('.')[0]) + int(time.split('.')[1])
            except Exception, error:
                print >> sys.stderr, "Unhandled Exception:", error, "in file:", filename
                continue
                #return
            # special handling of epoll - it should go to cache later
            if call[:11] == "epoll_wait_":
                call = "epoll_wait_:libc.so"
            else:
                # this regex substitution makes things very slow
                call = re.sub(r'_B:|_D:|_E:|_F:|_f:|_K:|_k:|_P:|_p:|_S:|_U:', ':', call)

            if call not in timing_info:
                timing_info[call] = [nsec]
            else:
                timing_info[call].append(nsec)

        else:
            continue

    # flush cache
    if cache.dirty_native_path:
        # separately retrieve fd type for open
        if cache.native_path[-1] == 'open:libc.so':
            cache.native_path[-1] = 'open_' + cache.fd_buff + ':libc.so'
        for _ in range(cache.repeat_current):
            #yield cache.native_path
            if cache.dirty_dalvik_path:
                yield cache.dalvik_path + cache.native_path
            else:
                yield cache.native_path
    f.close()

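In the timing branch above, call names are normalized by collapsing the one-letter suffix markers (_B:, _D:, and so on) back to a plain :, and the author notes the alternation-based substitution is slow. The sketch below performs the same normalization with a precompiled, character-class version of the pattern; the equivalence and the sample call names are assumptions of mine, not part of the project.

import re

# Collapse the one-letter suffix markers back to ":", as in the example
# above. [BDEFfKkPpSU] covers the same letters as the original alternation.
suffix = re.compile(r'_[BDEFfKkPpSU]:')
for call in ['read_F:libc.so', 'write_B:libc.so', 'epoll_wait_:libc.so']:
    print(suffix.sub(':', call))
# read:libc.so
# write:libc.so
# epoll_wait_:libc.so
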
Example 90

Project: viper Source File: emailparse.py
Function: run
    def run(self, *args):

        def string_clean(value):
            if value:
                return re.sub('[\n\t\r]', '', value)
            return ""

        def parse_ole_msg(ole):
            stream_dirs = ole.listdir()
            for stream in stream_dirs:
                # get stream that contains the email header
                if stream[0].startswith('__substg1.0_007D'):
                    email_header = ole.openstream(stream).read()
                    if stream[0].endswith('001F'):  # Unicode probably needs something better than just stripping \x00
                        email_header = email_header.replace('\x00', '')
            # If it came from outlook we may need to trim some lines
            try:
                email_header = email_header.split('Version 2.0\x0d\x0a', 1)[1]
            except:
                pass

            # Leaving us an RFC compliant email to parse
            msg = email.message_from_string(email_header)
            return msg

        def parse_ole_attachments(ole):
            # Hard part now: each part of the attachment is in a separate stream.

            # We need a unique stream id for each attachment;
            # it's in the stream name as an 8-digit number.
            for i in range(20):  # arbitrary count of attachments; we don't expect this many
                stream_number = str(i).zfill(8)
                stream_name = '__attach_version1.0_#' + stream_number
                # Unicode
                try:
                    att_filename = ole.openstream(stream_name + '/__substg1.0_3704001F').read()
                    att_mime = ole.openstream(stream_name + '/__substg1.0_370E001F').read()
                    att_data = ole.openstream(stream_name + '/__substg1.0_37010102').read()
                    att_size = len(att_data)
                    att_md5 = hashlib.md5(att_data).hexdigest()
                    print i, att_size, att_md5, att_filename, att_mime
                except:
                    pass
                # ASCII
                try:
                    att_filename = ole.openstream(stream_name + '/__substg1.0_3704001E').read()
                    att_mime = ole.openstream(stream_name + '/__substg1.0_370E001E').read()
                    att_data = ole.openstream(stream_name + '/__substg1.0_37010102').read()
                    att_size = len(att_data)
                    att_md5 = hashlib.md5(att_data).hexdigest()
                    print i, att_size, att_md5, att_filename, att_mime
                except:
                    pass

        def att_session(att_id, msg, ole_flag):
            att_count = 0
            if ole_flag:
                ole = msg
                # Hard part now: each part of the attachment is in a separate stream.

                # We need a unique stream id for each attachment;
                # it's in the stream name as an 8-digit number.
                for i in range(20):  # arbitrary count of attachments; we don't expect this many
                    stream_number = str(i).zfill(8)
                    stream_name = '__attach_version1.0_#' + stream_number
                    # Unicode
                    try:
                        att_filename = ole.openstream(stream_name + '/__substg1.0_3704001F').read()
                        att_filename = att_filename.replace('\x00', '')
                        att_data = ole.openstream(stream_name + '/__substg1.0_37010102').read()
                    except:
                        pass
                    # ASCII
                    try:
                        att_filename = ole.openstream(stream_name + '/__substg1.0_3704001E').read()
                        att_data = ole.openstream(stream_name + '/__substg1.0_37010102').read()
                    except:
                        pass
                    if i == att_id:
                        self.log('info', "Switching session to {0}".format(att_filename))
                        tmp_path = os.path.join(tempfile.gettempdir(), att_filename)
                        with open(tmp_path, 'w') as tmp:
                            tmp.write(att_data)
                        __sessions__.new(tmp_path)
                        return

            else:
                for part in msg.walk():
                    if part.get_content_type() == 'message/rfc822':
                        rfc822 = True
                    else:
                        rfc822 = False

                    if part.get_content_maintype() == 'multipart' \
                        or not part.get('Content-Disposition') \
                            and not rfc822:
                        continue

                    att_count += 1
                    if att_count == att_id:
                        if rfc822:
                            data = part.as_string()
                            m = re.match("Content-Type: message/rfc822\r?\n\r?\n(.*)", data, flags=re.S)
                            if not m:
                                self.log('error', "Could not extract RFC822 formatted message")
                                return
                            data = m.group(1)
                            att_size = len(data)
                            filename = "rfc822msg_{0}.eml".format(att_size)
                        else:
                            data = part.get_payload(decode=True)
                            filename = part.get_filename()

                        self.log('info', "Switching session to {0}".format(filename))

                        if data:
                            tmp_path = os.path.join(tempfile.gettempdir(), filename)
                            with open(tmp_path, 'w') as tmp:
                                tmp.write(data)
                            __sessions__.new(tmp_path)
                            return

        def email_envelope(msg):
            # Envelope
            self.log('info', "Email envelope:")
            rows = [
                ['Subject', msg.get("Subject")],
                ['To', msg.get("To")],
                ['From', msg.get("From")],
                ['Cc', msg.get("Cc")],
                ['Bcc', msg.get("Bcc")],
                ['Date', msg.get("Date")]
            ]
            self.log('table', dict(header=['Key', 'Value'], rows=rows))
            return

        def email_header(msg):
            # Headers
            rows = []
            for x in msg.keys():
                # Add Received to the ignore list; it has to be handled separately since there can be more than one line.
                if x not in ['Subject', 'From', 'To', 'Date', 'Cc', 'Bcc', 'DKIM-Signature', 'Received']:
                    rows.append([x, string_clean(msg.get(x))])
            for x in msg.get_all('Received'):
                rows.append(['Received', string_clean(x)])
            self.log('info', "Email headers:")
            rows = sorted(rows, key=lambda entry: entry[0])
            self.log('table', dict(header=['Key', 'Value'], rows=rows))
            return

        def email_trace(msg, verbose):
            rows = []
            if verbose:
                fields = ['from', 'by', 'with', 'id', 'for', 'timestamp']
            else:
                fields = ['from', 'by', 'timestamp']
            for x in msg.get_all('Received'):
                x = string_clean(x)
                cre = re.compile("""
                    (?: from \s+ (?P<from>.*?) (?=by|with|id|ID|for|;|$) )?
                    (?: by \s+ (?P<by>.*?) (?=with|id|ID|for|;|$) )?
                    (?: with \s+ (?P<with>.*?) (?=id|ID|for|;|$) )?
                    (?: (id|ID) \s+ (?P<id>.*?) (?=for|;|$) )?
                    (?: for \s+ (?P<for>.*?) (?=;|$) )?
                    (?: \s* ; \s* (?P<timestamp>.*) )?
                    """, flags=re.X | re.I)
                m = cre.search(x)
                if not m:
                    self.log('error', "Received header regex didn't match")
                    return
                t = []
                for groupname in fields:
                    t.append(string_clean(m.group(groupname)))
                rows.insert(0, t)
            self.log('info', "Email path trace:")
            self.log('table', dict(header=fields, rows=rows))
            return

        def email_spoofcheck(msg, dnsenabled):
            self.log('info', "Email spoof check:")

            # test 1: check if From address is the same as Sender, Reply-To, and Return-Path
            rows = [
                ['Sender', string_clean(msg.get("Sender"))],
                ['From', string_clean(msg.get("From"))],
                ['Reply-To', string_clean(msg.get("Reply-To"))],
                ['Return-Path', string_clean(msg.get("Return-Path"))]
            ]
            self.log('table', dict(header=['Key', 'Value'], rows=rows))
            addr = {
                'Sender': email.utils.parseaddr(string_clean(msg.get("Sender")))[1],
                'From': email.utils.parseaddr(string_clean(msg.get("From")))[1],
                'Reply-To': email.utils.parseaddr(string_clean(msg.get("Reply-To")))[1],
                'Return-Path': email.utils.parseaddr(string_clean(msg.get("Return-Path")))[1]
            }
            if (addr['From'] == ''):
                self.log('error', "No From address!")
                return
            elif addr['Sender'] and (addr['From'] != addr['Sender']):
                self.log('warning', "Email FAILED: From address different than Sender")
            elif addr['Reply-To'] and (addr['From'] != addr['Reply-To']):
                self.log('warning', "Email FAILED: From address different than Reply-To")
            elif addr['Return-Path'] and (addr['From'] != addr['Return-Path']):
                self.log('warning', "Email FAILED: From address different than Return-Path")
            else:
                self.log('success', "Email PASSED: From address the same as Sender, Reply-To, and Return-Path")

            # test 2: check to see if first Received: by domain matches sender MX domain
            if not dnsenabled:
                self.log('info', "Unable to run Received by / sender check without dnspython available")
            else:
                r = msg.get_all('Received')[-1]
                m = re.search("by\s+(\S*?)(?:\s+\(.*?\))?\s+with", r)
                if not m:
                    self.log('error', "Received header regex didn't match")
                    return
                byname = m.group(1)
                # this can be either a name or an IP
                m = re.search("(\w+\.\w+|\d+\.\d+\.\d+\.\d+)$", byname)
                if not m:
                    self.log('error', "Could not find domain or IP in Received by field")
                    return
                bydomain = m.group(1)
                domains = [['Received by', bydomain]]
                # if it's an IP, do the reverse lookup
                m = re.search("\.\d+$", bydomain)
                if m:
                    bydomain = str(dns.reversename.from_address(bydomain)).strip('.')
                    domains.append(['Received by reverse lookup', bydomain])
                # if the email has a Sender header, use that
                if (addr['Sender'] != ""):
                    m = re.search("(\w+\.\w+)$", addr['Sender'])
                    if not m:
                        self.log('error', "Sender header regex didn't match")
                        return
                    fromdomain = m.group(1)
                    domains.append(['Sender', fromdomain])
                # otherwise, use the From header
                else:
                    m = re.search("(\w+\.\w+)$", addr['From'])
                    if not m:
                        self.log('error', "From header regex didn't match")
                        return
                    fromdomain = m.group(1)
                    domains.append(['From', fromdomain])

                bymatch = False
                try:
                    mx = dns.resolver.query(fromdomain, 'MX')
                    if mx :
                        for rdata in mx:
                            m = re.search("(\w+\.\w+).$", str(rdata.exchange))
                            if not m:
                                self.log('error', "MX domain regex didn't match")
                                continue
                            domains.append(['MX for ' + fromdomain, m.group(1)])
                            if bydomain == m.group(1):
                                bymatch = True
                    self.log('table', dict(header=['Key', 'Value'], rows=domains))
                except:
                    domains.append(['MX for ' + fromdomain, "not registered in DNS"])
                    self.log('table', dict(header=['Key', 'Value'], rows=domains))
                if bymatch:
                    self.log('success', "Email PASSED: Received by domain found in Sender/From MX domains")
                else:
                    self.log('warning', "Email FAILED: Could not match Received by domain to Sender/From MX")

            # test 3: look at SPF records
            rspf = []
            results = set()
            allspf = msg.get_all('Received-SPF')
            if not allspf:
                return
            for spf in allspf:
                # self.log('info', string_clean(spf))
                m = re.search("\s*(\w+)\s+\((.*?):\s*(.*?)\)\s+(.*);", string_clean(spf))
                if not m:
                    self.log('error', "Received-SPF regex didn't match")
                    return
                rspf.append([m.group(2), m.group(1), m.group(3), m.group(4)])
                results = results | {m.group(1)}
            self.log('table', dict(header=['Domain', 'Action', 'Info', 'Additional'], rows=rspf))
            if results & {'fail', 'softfail'}:
                self.log('warning', "Email FAILED: Found fail or softfail SPF results")
            elif results & {'none', 'neutral'}:
                self.log('warning', "Email NEUTRAL: Found none or neutral SPF results")
            elif results & {'permerror', 'temperror'}:
                self.log('warning', "Email NEUTRAL: Found error condition")
            elif results & {'pass'}:
                self.log('success', "Email PASSED: Found SPF pass result")

            return

        def email_attachments(msg, ole_flag):
            # Attachments
            att_count = 0
            rows = []
            links = []
            if ole_flag:
                ole = msg
                # Hard part now: each part of the attachment is in a separate stream.

                # We need a unique stream id for each attachment;
                # it's in the stream name as an 8-digit number.
                for i in range(20):  # arbitrary count of attachments; we don't expect this many
                    stream_number = str(i).zfill(8)
                    stream_name = '__attach_version1.0_#' + stream_number
                    # Unicode
                    try:
                        att_filename = ole.openstream(stream_name + '/__substg1.0_3704001F').read()
                        att_mime = ole.openstream(stream_name + '/__substg1.0_370E001F').read()
                        att_data = ole.openstream(stream_name + '/__substg1.0_37010102').read()
                        att_size = len(att_data)
                        att_md5 = hashlib.md5(att_data).hexdigest()
                        rows.append([i, att_filename, att_mime, att_size, att_md5])
                        att_count += 1
                    except:
                        pass
                    # ASCII
                    try:
                        att_filename = ole.openstream(stream_name + '/__substg1.0_3704001E').read()
                        att_mime = ole.openstream(stream_name + '/__substg1.0_370E001E').read()
                        att_data = ole.openstream(stream_name + '/__substg1.0_37010102').read()
                        att_size = len(att_data)
                        att_md5 = hashlib.md5(att_data).hexdigest()
                        rows.append([i, att_filename, att_mime, att_size, att_md5])
                        att_count += 1
                    except:
                        pass

            else:
                # Walk through email string.
                for part in msg.walk():
                    content_type = part.get_content_type()

                    if content_type == 'multipart':
                        continue

                    if content_type in ('text/plain', 'text/html'):
                        part_content = part.get_payload(decode=True)
                        for link in re.findall(r'(https?://[^"<>\s]+)', part_content):
                            if link not in links:
                                links.append(link)

                    if content_type == 'message/rfc822':
                        part_content = part.as_string()
                        m = re.match("Content-Type: message/rfc822\r?\n\r?\n(.*)", part_content, flags=re.S)
                        if not m:
                            self.log('error', "Could not extract RFC822 formatted message")
                            return
                        part_content = m.group(1)
                        att_size = len(part_content)
                        att_file_name = "rfc822msg_{0}.eml".format(att_size)
                        att_md5 = hashlib.md5(part_content).hexdigest()
                        att_count += 1
                        rows.append([att_count, att_file_name, content_type, att_size, att_md5])
                        continue

                    if not part.get('Content-Disposition'):
                        # These are not attachments.
                        continue

                    att_file_name = part.get_filename()

                    if not att_file_name:
                        continue

                    att_data = part.get_payload(decode=True)
                    att_size = len(att_data)
                    att_md5 = hashlib.md5(att_data).hexdigest()
                    att_count += 1
                    rows.append([att_count, att_file_name, part.get_content_type(), att_size, att_md5])

            self.log('info', "Email attachments (total: {0}):".format(att_count))
            if att_count > 0:
                self.log('table', dict(header=['ID', 'FileName', 'Content Type', 'File Size', 'MD5'], rows=rows))

            self.log('info', "Email links:")
            for link in links:
                self.log('item', link)
            return

        # Start Here
        if not __sessions__.is_set():
            self.log('error', "No open session")
            return

        super(EmailParse, self).run(*args)
        if self.args is None:
            return

        # see if we can load the dns library for MX lookup spoof detection
        try:
            import dns.resolver
            import dns.reversename
            dnsenabled = True
        except ImportError:
            dnsenabled = False

        # Try to open as an ole msg, if not treat as email string
        try:
            ole = olefile.OleFileIO(__sessions__.current.file.path)
            ole_flag = True
        except:
            ole_flag = False
            email_handle = open(__sessions__.current.file.path)
            msg = email.message_from_file(email_handle)
            email_handle.close()

        if self.args.open is not None:
            if ole_flag:
                msg = ole
            att_session(self.args.open, msg, ole_flag)
        elif self.args.envelope:
            if ole_flag:
                msg = parse_ole_msg(ole)
            email_envelope(msg)
        elif self.args.attach:
            if ole_flag:
                msg = ole
            email_attachments(msg, ole_flag)
        elif self.args.header:
            if ole_flag:
                msg = parse_ole_msg(ole)
            email_header(msg)
        elif self.args.trace:
            if ole_flag:
                msg = parse_ole_msg(ole)
            email_trace(msg, False)
        elif self.args.traceall:
            if ole_flag:
                msg = parse_ole_msg(ole)
            email_trace(msg, True)
        elif self.args.spoofcheck:
            if ole_flag:
                msg = parse_ole_msg(ole)
            email_spoofcheck(msg, dnsenabled)
        elif self.args.all:
            if ole_flag:
                msg = parse_ole_msg(ole)
            email_envelope(msg)
            email_header(msg)
            email_trace(msg, True)
            email_spoofcheck(msg, dnsenabled)
            if ole_flag:
                msg = ole
            email_attachments(msg, ole_flag)
        else:
            self.log('error', 'At least one of the parameters is required')
            self.usage()

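The string_clean helper above strips newlines, tabs, and carriage returns from header values before they are matched or displayed. A tiny standalone version of that substitution; the sample Received header is made up.

import re

def string_clean(value):
    # Drop newlines, tabs, and carriage returns, as in the module above.
    return re.sub(r'[\n\t\r]', '', value) if value else ""

received = "from mail.example.com\r\n\tby mx.example.org; Mon, 1 Jan 2024"
print(string_clean(received))
# from mail.example.comby mx.example.org; Mon, 1 Jan 2024
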
Example 91

Project: asuka Source File: build.py
Function: install
    def _install(self):
        logger = self.get_logger('install')
        sudo = self.instance.sudo
        logger.info(
            'START TO INSTALL: branch = %r, commit = %r, instance = %r',
            self.branch, self.commit, self.instance
        )
        def setup_instance(service_manifests, service_manifests_available):
            logger = self.get_logger('install.setup_instance')
            with self.instance:
                def aptitude(*commands):
                    sudo(['aptitude', '-y'] + list(commands),
                         environ={'DEBIAN_FRONTEND': 'noninteractive'})
                # create user for app
                sudo(['useradd', '-U', '-G', 'users,www-data', '-Mr',
                      self.app.name])
                # assume instance uses Ubuntu >= 12.04
                apt_sources = re.sub(
                    r'\n#\s*(deb(?:-src)?\s+'
                    r'http://[^.]\.ec2\.archive\.ubuntu\.com/'
                    r'ubuntu/\s+[^-]+multiverse\n)',
                    lambda m: '\n' + m.group(1),
                    self.instance.read_file('/etc/apt/sources.list', sudo=True)
                )
                self.instance.write_file('/etc/apt/sources.list', apt_sources,
                                         sudo=True)
                apt_repos = set()
                apt_packages = set([
                    'build-essential', 'python-dev', 'python-setuptools',
                    'python-pip'
                ])
                with service_manifests_available:
                    while not service_manifests[0]:
                        service_manifests_available.wait()
                for service in service_manifests[1:]:
                    apt_repos.update(service.required_apt_repositories)
                    apt_packages.update(service.required_apt_packages)
                if apt_repos:
                    for repo in apt_repos:
                        sudo(['apt-add-repository', '-y', repo])
                    aptitude('update')
                with self.instance.sftp():
                    self.instance.write_file(
                        '/usr/bin/apt-fast',
                        resource_string(__name__, 'apt-fast'),
                        sudo=True
                    )
                    self.instance.write_file('/etc/apt-fast.conf', '''
_APTMGR=aptitude
DOWNLOADBEFORE=true
_MAXNUM=20
DLLIST='/tmp/apt-fast.list'
_DOWNLOADER='aria2c -c -j ${_MAXNUM} -i ${DLLIST} --connect-timeout=10 \
             --timeout=600 -m0'
DLDIR='/var/cache/apt/archives/apt-fast'
APTCACHE='/var/cache/apt/archives/'
                    ''', sudo=True)
                sudo(['chmod', '+x', '/usr/bin/apt-fast'])
                aptitude('install', 'aria2')
                sudo(['apt-fast', '-q', '-y', 'install'] + list(apt_packages),
                     environ={'DEBIAN_FRONTEND': 'noninteractive'})
        service_manifests_available = threading.Condition()
        service_manifests = [False]
        instance_setup_worker = threading.Thread(
            target=setup_instance,
            kwargs={
                'service_manifests_available': service_manifests_available,
                'service_manifests': service_manifests
            }
        )
        instance_setup_worker.start()
        # setup metadata of the instance
        self.update_instance_metadata()
        self.instance.status = 'started'
        # making package (pybundle)
        fd, package_path = tempfile.mkstemp()
        os.close(fd)
        with self.fetch() as download_path:
            service_manifests.extend(self.services)
            service_manifests[0] = True
            with service_manifests_available:
                service_manifests_available.notify()
            config_temp_path = tempfile.mkdtemp()
            shutil.copytree(
                os.path.join(download_path, self.app.config_dir),
                os.path.join(config_temp_path, self.app.name)
            )
            with self.dist.bundle_package() as (package, filename, temp_path):
                shutil.copyfile(temp_path, package_path)
                remote_path = os.path.join('/tmp', filename)
            with self.instance.sftp():
                # upload config files
                self.instance.put_directory(
                    os.path.join(config_temp_path, self.app.name),
                    '/etc/' + self.app.name,
                    sudo=True
                )
                shutil.rmtree(config_temp_path)
                python_packages = set()
                for service in service_manifests[1:]:
                    python_packages.update(service.required_python_packages)
                # uploads package
                self.instance.put_file(package_path, remote_path)
                # join instance_setup_worker
                instance_setup_worker.join()
                self.instance.status = 'apt-installed'
                pip_cmd = ['pip', 'install', '-i', PYPI_INDEX_URLS[0]]
                for idx in PYPI_INDEX_URLS[1:]:
                    pip_cmd.append('--extra-index-url=' + idx)
                sudo(pip_cmd + [remote_path], environ={'CI': '1'})
                sudo(pip_cmd + ['-I'] + list(python_packages),
                     environ={'CI': '1'})
                self.instance.status = 'installed'
                for service in service_manifests[1:]:
                    for cmd in service.pre_install:
                        sudo(cmd, environ={'DEBIAN_FRONTEND': 'noninteractive'})
                values_path = '/etc/{0}/values.json'.format(self.app.name)
                service_values = {
                    '.build': dict(
                        commit=self.commit.ref,
                        branch=self.branch.label
                    )
                }
                refresh_values = lambda: self.instance.write_file(
                    values_path,
                    json.dumps(service_values),
                    sudo=True
                )
                refresh_values()
                for service in service_manifests[1:]:
                    service_value = service.install(self.instance)
                    service_values[service.name] = service_value
                    refresh_values()
                for service in service_manifests[1:]:
                    for cmd in service.post_install:
                        sudo(cmd, environ={'DEBIAN_FRONTEND': 'noninteractive'})
        service_map = dict((service.name, service)
                           for service in service_manifests[1:])
        deployed_domains = {}
        if self.route53_hosted_zone_id and self.route53_records:
            self.instance.status = 'run'
            changeset = ResourceRecordSets(
                self.app.route53_connection,
                self.route53_hosted_zone_id,
                'Changed by Asuka: {0}, {1} [{2}]'.format(self.app.name,
                                                          self.branch.label,
                                                          self.commit.ref)
            )
            from .service import DomainService
            for service_name, domain_format in self.route53_records.items():
                service = service_map[service_name]
                if not isinstance(service, DomainService):
                    raise TypeError(repr(service) + ' is not an instance of '
                                    'crosspop.service.DomainService')
                domain = domain_format.format(branch=self.branch)
                deployed_domains[service_name] = domain
                service.route_domain(domain, changeset)
                self.instance.tags['Domain-' + service_name] = domain
            if changeset.changes:
                logger.info('Route 53 changeset:\n%s', changeset.to_xml())
                changeset.commit()
        self.instance.status = 'done'
        self.terminate_instances()
        return deployed_domains

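The sources.list rewrite above passes a callable as the replacement argument, so each matched commented-out multiverse line is re-emitted from its captured group without the leading #. A reduced sketch of that callable-replacement pattern; the simplified regex and the sample sources.list content are assumptions, not the project's exact pattern.

import re

# Un-comment "deb ... multiverse" lines by re-emitting only the captured
# group; the replacement callable drops the "# " prefix. Simplified regex.
sources = (
    "deb http://us-east-1.ec2.archive.ubuntu.com/ubuntu/ precise main\n"
    "# deb http://us-east-1.ec2.archive.ubuntu.com/ubuntu/ precise multiverse\n"
)
fixed = re.sub(r'\n#\s*(deb\s+\S+\s+\S+\s+multiverse\n)',
               lambda m: '\n' + m.group(1),
               sources)
print(fixed)
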
Example 92

Project: rpmlint Source File: BinariesCheck.py
    def __init__(self, pkg, path, file, is_ar, is_shlib):
        self.readelf_error = False
        self.needed = []
        self.rpath = []
        self.undef = []
        self.unused = []
        self.comment = False
        self.soname = False
        self.non_pic = True
        self.stack = False
        self.exec_stack = False
        self.exit_calls = []
        self.forbidden_calls = []
        fork_called = False
        self.tail = ''

        self.setgid = False
        self.setuid = False
        self.setgroups = False
        self.chroot = False
        self.chdir = False
        self.chroot_near_chdir = False
        self.mktemp = False

        is_debug = path.endswith('.debug')

        cmd = ['env', 'LC_ALL=C', 'readelf', '-W', '-S', '-l', '-d', '-s']
        cmd.append(path)
        res = Pkg.getstatusoutput(cmd)
        if not res[0]:
            lines = res[1].splitlines()
            for l in lines:
                r = BinaryInfo.needed_regex.search(l)
                if r:
                    self.needed.append(r.group(1))
                    continue

                r = BinaryInfo.rpath_regex.search(l)
                if r:
                    for p in r.group(1).split(':'):
                        self.rpath.append(p)
                    continue

                if BinaryInfo.comment_regex.search(l):
                    self.comment = True
                    continue

                if BinaryInfo.pic_regex.search(l):
                    self.non_pic = False
                    continue

                r = BinaryInfo.soname_regex.search(l)
                if r:
                    self.soname = r.group(1)
                    continue

                r = BinaryInfo.stack_regex.search(l)
                if r:
                    self.stack = True
                    flags = r.group(1)
                    if flags and BinaryInfo.stack_exec_regex.search(flags):
                        self.exec_stack = True
                    continue

                if l.startswith("Symbol table"):
                    break

            for l in lines:
                r = BinaryInfo.call_regex.search(l)
                if not r:
                    continue
                l = r.group(1)

                if BinaryInfo.mktemp_call_regex.search(l):
                    self.mktemp = True

                if BinaryInfo.setgid_call_regex.search(l):
                    self.setgid = True

                if BinaryInfo.setuid_call_regex.search(l):
                    self.setuid = True

                if BinaryInfo.setgroups_call_regex.search(l):
                    self.setgroups = True

                if BinaryInfo.chdir_call_regex.search(l):
                    self.chdir = True

                if BinaryInfo.chroot_call_regex.search(l):
                    self.chroot = True

                if BinaryInfo.forbidden_functions:
                    for r_name, func in BinaryInfo.forbidden_functions.items():
                        ret = func['f_regex'].search(l)
                        if ret:
                            self.forbidden_calls.append(r_name)

                if is_shlib:
                    r = BinaryInfo.exit_call_regex.search(l)
                    if r:
                        self.exit_calls.append(r.group(1))
                        continue
                    r = BinaryInfo.fork_call_regex.search(l)
                    if r:
                        fork_called = True
                        continue

            # check if we don't have a string that will automatically
            # waive the presence of a forbidden call
            if self.forbidden_calls:
                cmd = ['env', 'LC_ALL=C', 'strings']
                cmd.append(path)
                res = Pkg.getstatusoutput(cmd)
                if not res[0]:
                    for l in res[1].splitlines():
                        # as we need to remove elements, iterate backwards
                        for i in range(len(self.forbidden_calls) - 1, -1, -1):
                            func = self.forbidden_calls[i]
                            f = BinaryInfo.forbidden_functions[func]
                            if 'waiver_regex' not in f:
                                continue
                            r = f['waiver_regex'].search(l)
                            if r:
                                del self.forbidden_calls[i]

            if self.non_pic:
                self.non_pic = 'TEXTREL' in res[1]

            # Ignore all exit() calls if fork() is being called.
            # Does not have any context at all but without this kludge, the
            # number of false positives would probably be intolerable.
            if fork_called:
                self.exit_calls = []

            # check if chroot is near chdir (since otherwise, chroot is called
            # without chdir)
            # Currently this implementation works only on x86_64 due to reliance
            # on x86_64 specific assembly. Skip it on other architectures
            if pkg.arch == 'x86_64' and self.chroot and self.chdir:
                p = subprocess.Popen(
                    ['env', 'LC_ALL=C', 'objdump', '-d', path],
                    stdout=subprocess.PIPE, bufsize=-1)
                with p.stdout:
                    index = 0
                    chroot_index = -99
                    chdir_index = -99
                    for line in p.stdout:
                        res = BinaryInfo.objdump_call_regex.search(line)
                        if not res:
                            continue
                        if b'@plt' not in res.group(1):
                            pass
                        elif b'chroot@plt' in res.group(1):
                            chroot_index = index
                            if abs(chroot_index - chdir_index) <= 2:
                                self.chroot_near_chdir = True
                                break
                        elif b'chdir@plt' in res.group(1):
                            chdir_index = index
                            if abs(chroot_index - chdir_index) <= 2:
                                self.chroot_near_chdir = True
                                break
                        index += 1
                if p.wait() and not self.chroot_near_chdir:
                    printWarning(pkg, 'binaryinfo-objdump-failed', file)
                    self.chroot_near_chdir = True  # avoid false positive

        else:
            self.readelf_error = True
            printWarning(pkg, 'binaryinfo-readelf-failed',
                         file, re.sub('\n.*', '', res[1]))

        try:
            with open(path, 'rb') as fobj:
                fobj.seek(-12, os.SEEK_END)
                self.tail = Pkg.b2s(fobj.read())
        except Exception as e:
            printWarning(pkg, 'binaryinfo-tail-failed %s: %s' % (file, e))

        # Undefined symbol and unused direct dependency checks make sense only
        # for installed packages.
        # skip debuginfo: https://bugzilla.redhat.com/190599
        if not is_ar and not is_debug and isinstance(pkg, Pkg.InstalledPkg):
            # We could do this with objdump, but it's _much_ simpler with ldd.
            res = Pkg.getstatusoutput(
                ('env', 'LC_ALL=C', 'ldd', '-d', '-r', path))
            if not res[0]:
                for l in res[1].splitlines():
                    undef = BinaryInfo.undef_regex.search(l)
                    if undef:
                        self.undef.append(undef.group(1))
                if self.undef:
                    cmd = self.undef[:]
                    cmd.insert(0, 'c++filt')
                    try:
                        res = Pkg.getstatusoutput(cmd)
                        if not res[0]:
                            self.undef = res[1].splitlines()
                    except:
                        pass
            else:
                printWarning(pkg, 'ldd-failed', file)
            res = Pkg.getstatusoutput(
                ('env', 'LC_ALL=C', 'ldd', '-r', '-u', path))
            if res[0]:
                # Either ldd doesn't grok -u (added in glibc 2.3.4) or we have
                # unused direct dependencies
                in_unused = False
                for l in res[1].splitlines():
                    if not l.rstrip():
                        pass
                    elif l.startswith('Unused direct dependencies'):
                        in_unused = True
                    elif in_unused:
                        unused = BinaryInfo.unused_regex.search(l)
                        if unused:
                            self.unused.append(unused.group(1))
                        else:
                            in_unused = False
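
The re.sub call in this example appears only in the error path: it trims the multi-line readelf output down to its first line before reporting it. A minimal, self-contained sketch of that behaviour (the sample string is invented):

import re

# hypothetical multi-line error output from readelf
output = "readelf: Error: not an ELF file\nsecond line\nthird line"

# each '\n.*' match covers a newline plus the rest of that line,
# so replacing the matches with '' keeps only the first line
print(re.sub('\n.*', '', output))   # -> readelf: Error: not an ELF file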

Example 93

Project: rivescript-python Source File: brain.py
    def process_tags(self, user, msg, reply, st=[], bst=[], depth=0, ignore_object_errors=True):
        """Post process tags in a message.

        :param str user: The user ID.
        :param str msg: The user's formatted message.
        :param str reply: The raw RiveScript reply for the message.
        :param []str st: The array of ``<star>`` matches from the trigger.
        :param []str bst: The array of ``<botstar>`` matches from a
            ``%Previous`` command.
        :param int depth: The recursion depth counter.
        :param bool ignore_object_errors: Whether to ignore errors in Python
            object macros instead of raising an ``ObjectError`` exception.

        :return str: The final reply after tags have been processed.
        """
        stars = ['']
        stars.extend(st)
        botstars = ['']
        botstars.extend(bst)
        if len(stars) == 1:
            stars.append("undefined")
        if len(botstars) == 1:
            botstars.append("undefined")

        # Tag shortcuts.
        reply = reply.replace('<person>', '{person}<star>{/person}')
        reply = reply.replace('<@>', '{@<star>}')
        reply = reply.replace('<formal>', '{formal}<star>{/formal}')
        reply = reply.replace('<sentence>', '{sentence}<star>{/sentence}')
        reply = reply.replace('<uppercase>', '{uppercase}<star>{/uppercase}')
        reply = reply.replace('<lowercase>', '{lowercase}<star>{/lowercase}')

        # Weight and <star> tags.
        reply = re.sub(RE.weight, '', reply)  # Leftover {weight}s
        if len(stars) > 0:
            reply = reply.replace('<star>', stars[1])
            reStars = re.findall(RE.star_tags, reply)
            for match in reStars:
                if int(match) < len(stars):
                    reply = reply.replace('<star{match}>'.format(match=match), stars[int(match)])
        if len(botstars) > 0:
            reply = reply.replace('<botstar>', botstars[1])
            reStars = re.findall(RE.botstars, reply)
            for match in reStars:
                if int(match) < len(botstars):
                    reply = reply.replace('<botstar{match}>'.format(match=match), botstars[int(match)])

        # <input> and <reply>
        history = self.master.get_uservar(user, "__history__")
        if type(history) is not dict:
            history = self.default_history()
        reply = reply.replace('<input>', history['input'][0])
        reply = reply.replace('<reply>', history['reply'][0])
        reInput = re.findall(RE.input_tags, reply)
        for match in reInput:
            reply = reply.replace('<input{match}>'.format(match=match),
                                  history['input'][int(match) - 1])
        reReply = re.findall(RE.reply_tags, reply)
        for match in reReply:
            reply = reply.replace('<reply{match}>'.format(match=match),
                                  history['reply'][int(match) - 1])

        # <id> and escape codes.
        reply = reply.replace('<id>', user)
        reply = reply.replace('\\s', ' ')
        reply = reply.replace('\\n', "\n")
        reply = reply.replace('\\#', '#')

        # Random bits.
        reRandom = re.findall(RE.random_tags, reply)
        for match in reRandom:
            output = ''
            if '|' in match:
                output = random.choice(match.split('|'))
            else:
                output = random.choice(match.split(' '))
            reply = reply.replace('{{random}}{match}{{/random}}'.format(match=match), output)

        # Person Substitutions and String Formatting.
        for item in ['person', 'formal', 'sentence', 'uppercase',  'lowercase']:
            matcher = re.findall(r'\{' + item + r'\}(.+?)\{/' + item + r'\}', reply)
            for match in matcher:
                output = None
                if item == 'person':
                    # Person substitutions.
                    output = self.substitute(match, "person")
                else:
                    output = utils.string_format(match, item)
                reply = reply.replace('{{{item}}}{match}{{/{item}}}'.format(item=item, match=match), output)

        # Handle all variable-related tags with an iterative regex approach,
        # to allow for nesting of tags in arbitrary ways (think <set a=<get b>>)
        # Dummy out the <call> tags first, because we don't handle them right
        # here.
        reply = reply.replace("<call>", "{__call__}")
        reply = reply.replace("</call>", "{/__call__}")
        while True:
            # This regex will match a <tag> which contains no other tag inside
            # it, i.e. in the case of <set a=<get b>> it will match <get b> but
            # not the <set> tag, on the first pass. The second pass will get the
            # <set> tag, and so on.
            match = re.search(RE.tag_search, reply)
            if not match: break  # No remaining tags!

            match = match.group(1)
            parts  = match.split(" ", 1)
            tag    = parts[0].lower()
            data   = parts[1] if len(parts) > 1 else ""
            insert = ""  # Result of the tag evaluation

            # Handle the tags.
            if tag == "bot" or tag == "env":
                # <bot> and <env> tags are similar.
                target = self.master._var if tag == "bot" else self.master._global
                if "=" in data:
                    # Setting a bot/env variable.
                    parts = data.split("=")
                    self.say("Set " + tag + " variable " + text_type(parts[0]) + "=" + text_type(parts[1]))
                    target[parts[0]] = parts[1]
                else:
                    # Getting a bot/env variable.
                    insert = target.get(data, "undefined")
            elif tag == "set":
                # <set> user vars.
                parts = data.split("=")
                self.say("Set uservar " + text_type(parts[0]) + "=" + text_type(parts[1]))
                self.master.set_uservar(user, parts[0], parts[1])
            elif tag in ["add", "sub", "mult", "div"]:
                # Math operator tags.
                parts = data.split("=")
                var   = parts[0]
                value = parts[1]
                curv  = self.master.get_uservar(user, var)

                # Sanity check the value.
                try:
                    value = int(value)
                    if curv in [None, "undefined"]:
                        # Initialize it.
                        curv = 0
                except:
                    insert = "[ERR: Math can't '{}' non-numeric value '{}']".format(tag, value)

                # Attempt the operation.
                try:
                    orig = int(curv)
                    new  = 0
                    if tag == "add":
                        new = orig + value
                    elif tag == "sub":
                        new = orig - value
                    elif tag == "mult":
                        new = orig * value
                    elif tag == "div":
                        new = orig / value
                    self.master.set_uservar(user, var, new)
                except:
                    insert = "[ERR: Math couldn't '{}' to value '{}']".format(tag, curv)
            elif tag == "get":
                insert = self.master.get_uservar(user, data)
            else:
                # Unrecognized tag.
                insert = "\x00{}\x01".format(match)

            reply = reply.replace("<{}>".format(match), text_type(insert))

        # Restore unrecognized tags.
        reply = reply.replace("\x00", "<").replace("\x01", ">")

        # Streaming code. DEPRECATED!
        if '{!' in reply:
            self._warn("Use of the {!...} tag is deprecated and not supported here.")

        # Topic setter.
        reTopic = re.findall(RE.topic_tag, reply)
        for match in reTopic:
            self.say("Setting user's topic to " + match)
            self.master.set_uservar(user, "topic", match)
            reply = reply.replace('{{topic={match}}}'.format(match=match), '')

        # Inline redirecter.
        reRedir = re.findall(RE.redir_tag, reply)
        for match in reRedir:
            self.say("Redirect to " + match)
            at = match.strip()
            subreply = self._getreply(user, at, step=(depth + 1))
            reply = reply.replace('{{@{match}}}'.format(match=match), subreply)

        # Object caller.
        reply = reply.replace("{__call__}", "<call>")
        reply = reply.replace("{/__call__}", "</call>")
        reCall = re.findall(r'<call>(.+?)</call>', reply)
        for match in reCall:
            parts  = re.split(RE.ws, match)
            output = ''
            obj    = parts[0]
            args   = []
            if len(parts) > 1:
                args = parts[1:]

            # Do we know this object?
            if obj in self.master._objlangs:
                # We do, but do we have a handler for that language?
                lang = self.master._objlangs[obj]
                if lang in self.master._handlers:
                    # We do.
                    try:
                        output = self.master._handlers[lang].call(self.master, obj, user, args)
                    except python.PythonObjectError as e:
                        self.warn(str(e))
                        if not ignore_object_errors:
                            raise ObjectError(str(e))
                        output = RS_ERR_OBJECT
                else:
                    if not ignore_object_errors:
                        raise ObjectError(RS_ERR_OBJECT_HANDLER)
                    output = RS_ERR_OBJECT_HANDLER
            else:
                if not ignore_object_errors:
                    raise ObjectError(RS_ERR_OBJECT_MISSING)
                output = RS_ERR_OBJECT_MISSING

            reply = reply.replace('<call>{match}</call>'.format(match=match), output)

        return reply
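
Most of the tag handling above relies on str.replace and re.findall; the lone re.sub call removes leftover {weight} markers from the chosen reply. A small illustrative sketch, using a stand-in pattern rather than rivescript's actual RE.weight:

import re

# stand-in pattern; the real RE.weight in rivescript may differ
weight = re.compile(r'\{weight=\d+\}\s*')

reply = "{weight=25}Hello there, <star>!"
print(re.sub(weight, '', reply))   # -> Hello there, <star>!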

Example 94

Project: ewrt Source File: phonetics.py
Function: metaphone
def metaphone (term):
    "returns metaphone code for a given string"

    # implementation of the original algorithm from Lawrence Philips
    # extended/rewritten by M. Kuhn
    # improvements with thanks to John Machin <[email protected]>

    # define return value
    code = ""

    i = 0
    term_length = len(term)

    if (term_length == 0):
        # empty string ?
        return code
    # end if

    # extension #1 (added 2005-01-28)
    # convert to lowercase
    term = term.lower()

    # extension #2 (added 2005-01-28)
    # remove all non-english characters, first
    term = re.sub(r'[^a-z]', '', term)
    if len(term) == 0:
        # nothing left
        return code
    # end if

    # extension #3 (added 2005-01-24)
    # conflate repeated letters
    firstChar = term[0]
    str2 = firstChar
    for x in term:
        if x != str2[-1]:
            str2 = str2 + x
        # end if
    # end for

    # extension #4 (added 2005-01-24)
    # remove any vowels unless a vowel is the first letter
    firstChar = str2[0]
    str3 = firstChar
    for x in str2[1:]:
        if (re.search(r'[^aeiou]', x)):
            str3 = str3 + x
        # end if
    # end for

    term = str3
    term_length = len(term)
    if term_length == 0:
        # nothing left
        return code
    # end if

    # check for exceptions
    if (term_length > 1):
        # get first two characters
        first_chars = term[0:2]

        # build translation table
        table = {
            "ae":"e",
            "gn":"n",
            "kn":"n",
            "pn":"n",
            "wr":"n",
            "wh":"w"
        }

        if first_chars in table.keys():
            term = term[2:]
            code = table[first_chars]
            term_length = len(term)
        # end if

    elif (term[0] == "x"):
        term = ""
        code = "s"
        term_length = 0
    # end if

    # define standard translation table
    st_trans = {
        "b":"b",
        "c":"k",
        "d":"t",
        "g":"k",
        "h":"h",
        "k":"k",
        "p":"p",
        "q":"k",
        "s":"s",
        "t":"t",
        "v":"f",
        "w":"w",
        "x":"ks",
        "y":"y",
        "z":"s"
    }

    i = 0
    while (i<term_length):
        # init character to add, init basic patterns
        add_char = ""
        part_n_2 = ""
        part_n_3 = ""
        part_n_4 = ""
        part_c_2 = ""
        part_c_3 = ""

        # extract a number of patterns, if possible
        if (i < (term_length - 1)):
            part_n_2 = term[i:i+2]

            if (i>0):
                part_c_2 = term[i-1:i+1]
                part_c_3 = term[i-1:i+2]
            # end if
        # end if

        if (i < (term_length - 2)):
            part_n_3 = term[i:i+3]
        # end if

        if (i < (term_length - 3)):
            part_n_4 = term[i:i+4]
        # end if

        # use table with conditions for translations
        if (term[i] == "b"):
            add_char = st_trans["b"]
            if (i == (term_length - 1)):
                if (i>0):
                    if (term[i-1] == "m"):
                        add_char = ""
                    # end if
                # end if
            # end if
        elif (term[i] == "c"):
            add_char = st_trans["c"]
            if (part_n_2 == "ch"):
                add_char = "x"
            elif (re.search(r'c[iey]', part_n_2)):
                add_char = "s"
            # end if

            if (part_n_3 == "cia"):
                add_char = "x"
            # end if

            if (re.search(r'sc[iey]', part_c_3)):
                add_char = ""
            # end if

        elif (term[i] == "d"):
            add_char = st_trans["d"]
            if (re.search(r'dg[eyi]', part_n_3)):
                add_char = "j"
            # end if

        elif (term[i] == "g"):
            add_char = st_trans["g"]

            if (part_n_2 == "gh"):
                if (i == (term_length - 2)):
                    add_char = ""
                # end if
            elif (re.search(r'gh[aeiouy]', part_n_3)):
                add_char = ""
            elif (part_n_2 == "gn"):
                add_char = ""
            elif (part_n_4 == "gned"):
                add_char = ""
            elif (re.search(r'dg[eyi]',part_c_3)):
                add_char = ""
            elif (part_n_2 == "gi"):
                if (part_c_3 != "ggi"):
                    add_char = "j"
                # end if
            elif (part_n_2 == "ge"):
                if (part_c_3 != "gge"):
                    add_char = "j"
                # end if
            elif (part_n_2 == "gy"):
                if (part_c_3 != "ggy"):
                    add_char = "j"
                # end if
            elif (part_n_2 == "gg"):
                add_char = ""
            # end if
        elif (term[i] == "h"):
            add_char = st_trans["h"]
            if (re.search(r'[aeiouy]h[^aeiouy]', part_c_3)):
                add_char = ""
            elif (re.search(r'[csptg]h', part_c_2)):
                add_char = ""
            # end if
        elif (term[i] == "k"):
            add_char = st_trans["k"]
            if (part_c_2 == "ck"):
                add_char = ""
            # end if
        elif (term[i] == "p"):
            add_char = st_trans["p"]
            if (part_n_2 == "ph"):
                add_char = "f"
            # end if
        elif (term[i] == "q"):
            add_char = st_trans["q"]
        elif (term[i] == "s"):
            add_char = st_trans["s"]
            if (part_n_2 == "sh"):
                add_char = "x"
            # end if

            if (re.search(r'si[ao]', part_n_3)):
                add_char = "x"
            # end if
        elif (term[i] == "t"):
            add_char = st_trans["t"]
            if (part_n_2 == "th"):
                add_char = "0"
            # end if

            if (re.search(r'ti[ao]', part_n_3)):
                add_char = "x"
            # end if
        elif (term[i] == "v"):
            add_char = st_trans["v"]
        elif (term[i] == "w"):
            add_char = st_trans["w"]
            if (re.search(r'w[^aeiouy]', part_n_2)):
                add_char = ""
            # end if
        elif (term[i] == "x"):
            add_char = st_trans["x"]
        elif (term[i] == "y"):
            add_char = st_trans["y"]
        elif (term[i] == "z"):
            add_char = st_trans["z"]
        else:
            # alternative
            add_char = term[i]
        # end if

        code = code + add_char
        i += 1
    # end while

    # return metaphone code
    return code
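
Extensions #2 through #4 do the normalisation with plain loops and regexes: extension #2 uses re.sub to drop every character outside a-z once the term has been lowercased. A quick sketch of that step on an invented input:

import re

term = "O'Brien 3rd".lower()
# remove anything that is not a lowercase ASCII letter, as extension #2 does
term = re.sub(r'[^a-z]', '', term)
print(term)   # -> obrienrd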

Example 95

Project: TARDIS Source File: TARDIS.py
def main(vulnerability,vulnObject,sourceIP,sourceHost):
	#Create results and working directories
	if not os.path.exists('Results'):
		os.makedirs('Results')
	if not os.path.exists('Working'):
		os.makedirs('Working')
	
	#Make sure the vulnerability is valid
	if vulnerability != "":
		vulnCheck=0
		resultCount=0
		logsource=''
		print("Searching for evidence of \"" + vulnerability + "\"")
		print("  Host: " + sourceIP)
		
		try:
			configFile = 'config.xml'
			tree = ET.parse(configFile)
			root = tree.getroot()
		except:
			sys.exit("Not a valid config XML file")
		for settings in root.findall("./log_source"):
			logsource=settings.text
		cnx = getDBConnector()
		
		
		#check if vulnerability/asset combo exists in assetVulnerability Table
		cursor = cnx.cursor()
		query = ("SELECT count(*) as count from assetVulnerabilities where victim_ip = '" + str(ip2long(sourceIP)) + "' and threat_id = '" + vulnerability + "'")
		
		cursor.execute(query)
		for row in cursor:
			vulnCheck=row[0]
		cursor.close()
		
		if vulnCheck==0:
			#No combination exists, write data to DB
			
			cursor = cnx.cursor()
			add_vulnInstance = ("INSERT INTO assetVulnerabilities "
               "(victim_ip, threat_id, active) "
               "VALUES (%s, %s, %s)")
			vulnData = (ip2long(sourceIP), vulnerability, '1')
			
			# Insert new entry
			cursor.execute(add_vulnInstance , vulnData )
			
			cnx.commit()
			cursor.close()
			cnx.close()
		searchStringResults= findStixObservables.run(vulnerability)
		isExploitFound=False
		searchStringCount=0
		operator=searchStringResults[0]
		numResults=0
		if(searchStringResults[1]=="No search file found"):
			searchResults="0"
			print("  No search file found\n")
		elif(searchStringResults[1]=="No supported observables found"):
			searchResults="0"
			print("  No supported observables found\n")
		else:
			#run  search...
			#search should return number of results
			#Insert source host from arguments
			for entry in searchStringResults:
				if logsource=="splunk":
					if (searchStringCount == 1):
						searchString=entry + " AND (host=\"" + sourceHost + "\" OR s_ip=\"" + sourceIP + "\" OR d_host=\"" + sourceHost + "\")  | fields host, c_ip | fields - _bkt, _cd, _indextime, _kv, _serial, _si, _sourcetype | rename _raw as \"Raw Log\" | rename c_ip as clientip"
						numResults=splunk.searchVulnerability(searchString,vulnerability,sourceIP,sourceHost)
						if (numResults != "0"):
							data = json.load(numResults)
					
					if (operator=="AND"):
						if (searchStringCount > 1):
							resultCount=0
							for result in data["results"]:
								startTime =  dateutil.parser.parse(data["results"][resultCount]["_time"]) + datetime.timedelta(days =- 300)
								endTime =  dateutil.parser.parse(data["results"][resultCount]["_time"]) + datetime.timedelta(days = 300)
								searchString=entry + " AND (host=\"" + sourceHost + "\" OR s_ip=\"" + sourceIP + "\" OR d_host=\"" + sourceHost + "\") | fields host, clientip | fields - _bkt, _cd, _indextime, _kv, _serial, _si, _sourcetype | rename _raw as \"Raw Log\""
								newResults=splunk.searchVulnerabilityTimeRange(searchString,vulnerability,sourceIP,sourceHost,startTime.isoformat(),endTime.isoformat())
								if (newResults != "0"):
									#This is the result from search 1
									newData = json.load(newResults)
									newResultCount=0
									for result in newData["results"]:
										try:
											clientip=newData["results"][newResultCount]["clientip"]
										except:
											clientip="0"
										isExploitFound=True
										#These are the results from any further results proving the AND condition
										cnx = getDBConnector()
										cursor = cnx.cursor()
										query = ("SELECT count(*) as count from attackInventory where victim_ip = '" + str(ip2long(sourceIP)) + "' and threat_id = '" + vulnerability + "' and attack_time = '" + data["results"][resultCount]["_time"] + "'")
										cursor.execute(query)
										for row in cursor:
											logCheck=row[0]
										cursor.close()
										if logCheck==0:
											#Write data to DB
											cursor = cnx.cursor()
											add_logInstance = ("INSERT INTO attackInventory "
																"(victim_ip, attacker_ip, attack_time, attack_log, threat_id) "
																"VALUES (%s, %s, %s, %s, %s)")
											
											logData = (ip2long(sourceIP), ip2long(clientip), newData["results"][newResultCount]["_time"], newData["results"][newResultCount]["Raw Log"], vulnerability)
											# Insert new entry
											cursor.execute(add_logInstance , logData )
											cnx.commit()
											cursor.close()
										cnx.close()
										newResultCount=newResultCount+1
								else:
									newResultCount=0
							if (isExploitFound==True):
								try:
									clientip=data["results"][resultCount]["clientip"]
								except:
									clientip="0"
								cnx = getDBConnector()
								cursor = cnx.cursor()
								query = ("SELECT count(*) as count from attackInventory where victim_ip = '" + str(ip2long(sourceIP)) + "' and threat_id = '" + vulnerability + "' and attack_time = '" + data["results"][resultCount]["_time"] + "'")
								cursor.execute(query)
								for row in cursor:
									logCheck=row[0]
								cursor.close()
								if logCheck==0:
									#Write data to DB
									cursor = cnx.cursor()
									add_logInstance = ("INSERT INTO attackInventory "
														"(victim_ip, attacker_ip, attack_time, attack_log, threat_id) "
														"VALUES (%s, %s, %s, %s, %s)")
									
									logData = (ip2long(sourceIP), ip2long(clientip), data["results"][resultCount]["_time"], data["results"][resultCount]["Raw Log"], vulnerability)
									# Insert new entry
									cursor.execute(add_logInstance , logData )
									cnx.commit()
									cursor.close()
								cnx.close()
								resultCount=newResultCount+1
							else:
								resultCount=newResultCount
					elif (operator=="OR"):
						if (searchStringCount > 0):
							#only keep searching if there are more IOCS to look at...
							if len(searchStringResults)>2:
								searchString=entry + " AND (host=\"" + sourceHost + "\" OR s_ip=\"" + sourceIP + "\" OR d_host=\"" + sourceHost + "\")  | fields host, clientip | fields - _bkt, _cd, _indextime, _kv, _serial, _si, _sourcetype | rename _raw as \"Raw Log\""
								numResults=splunk.searchVulnerability(searchString,vulnerability,sourceIP,sourceHost)
								if (numResults != "0"):
									data = json.load(numResults)
									resultCount=0
									for result in data["results"]:
										isExploitFound=True
										cnx = getDBConnector()
										cursor = cnx.cursor()
										query = ("SELECT count(*) as count from attackInventory where victim_ip = '" + str(ip2long(sourceIP)) + "' and threat_id = '" + vulnerability + "' and attack_time = '" + data["results"][resultCount]["_time"] + "'")
										cursor.execute(query)
										for row in cursor:
											logCheck=row[0]
										cursor.close()
										if logCheck==0:
											#Write data to DB
											cursor = cnx.cursor()
											add_logInstance = ("INSERT INTO attackInventory "
																"(victim_ip, attacker_ip, attack_time, attack_log, threat_id) "
																"VALUES (%s, %s, %s, %s, %s)")
											logData = (ip2long(sourceIP), ip2long(data["results"][resultCount]["clientip"]), data["results"][resultCount]["_time"], data["results"][resultCount]["Raw Log"], vulnerability)
											
											# Insert new entry
											cursor.execute(add_logInstance , logData )
											
											cnx.commit()
											cursor.close()
										cnx.close()
										resultCount=resultCount+1
							elif len(searchStringResults)==2:
								searchString=entry + " AND (host=\"" + sourceHost + "\" OR host=\"" + sourceIP + "\" OR s_ip=\"" + sourceIP + "\" OR d_host=\"" + sourceHost + "\")  | fields host, clientip | fields - _bkt, _cd, _indextime, _kv, _serial, _si, _sourcetype | rename _raw as \"Raw Log\""
								numResults=splunk.searchVulnerability(searchString,vulnerability,sourceIP,sourceHost)
								if (numResults != "0"):
									data = json.load(numResults)
									resultCount=0
									for result in data["results"]:
										isExploitFound=True
										cnx = getDBConnector()
										cursor = cnx.cursor()
										query = ("SELECT count(*) as count from attackInventory where victim_ip = '" + str(ip2long(sourceIP)) + "' and threat_id = '" + vulnerability + "' and attack_time = '" + data["results"][resultCount]["_time"] + "'")
										cursor.execute(query)
										for row in cursor:
											logCheck=row[0]
										cursor.close()
										if logCheck==0:
											#Write data to DB
											cursor = cnx.cursor()
											add_logInstance = ("INSERT INTO attackInventory "
																"(victim_ip, attacker_ip, attack_time, attack_log, threat_id) "
																"VALUES (%s, %s, %s, %s, %s)")
											
											logData = (ip2long(sourceIP), ip2long(data["results"][resultCount]["clientip"]), data["results"][resultCount]["_time"], data["results"][resultCount]["Raw Log"], vulnerability)
											
											# Insert new entry
											cursor.execute(add_logInstance , logData )
											
											cnx.commit()
											cursor.close()
										cnx.close()
										resultCount=resultCount+1
					searchStringCount=searchStringCount+1
				elif logsource=="elastic_search":
					numResults=0
					startTime="-90d"
					endTime="now"
					#Insert source host from arguments
					entry = re.sub('\<source_host\>', sourceHost, entry)
					#Insert source IP from arguments
					entry = re.sub('\<source_ip\>', sourceIP, entry)
					if (searchStringCount == 1):
						#Insert startTime
						entry = re.sub('\<startTime\>', startTime, entry)
						#Insert endTime
						entry = re.sub('\<endTime\>', endTime, entry)
						if sourceIP == '*':
							entry = re.sub('\<min_count\>', '1', entry)
						else:
							entry = re.sub('\<min_count\>', '2', entry)
						#print entry
						searchResults = ElasticSearchQuery.searchVulnerability(entry,vulnerability,sourceIP,sourceHost)
						#print searchResults
						numResults = getElasticSearchResults(searchResults)
						#print numResults
					if (operator=="AND"):
						if (searchStringCount > 1):
							resultCount=0
							for hit in searchResults['hits']['hits']:
								startTime =  dateutil.parser.parse(hit["_source"]["@timestamp"]) + datetime.timedelta(days =- 1)
								
								endTime =  dateutil.parser.parse(hit["_source"]["@timestamp"]) + datetime.timedelta(days = 1)
								#Insert start time
								entry = re.sub('\<startTime\>', str(startTime.isoformat()), entry)
								#Insert end time
								entry = re.sub('\<endTime\>', str(endTime.isoformat()), entry)
								newSearchResults = ElasticSearchQuery.searchVulnerability(entry,vulnerability,sourceIP,sourceHost)
								newResults = getElasticSearchResults(newSearchResults)
								if (newResults != "0"):
									#This is the result from search 1
									newResultCount=0
									isExploitFound=True
									for newhit in newSearchResults['hits']['hits']:
										try:
											attackerIP=newhit["_source"]["evt_srcip"]
										except:
											attackerIP="0.0.0.0"
										#These are the results from any further results proving the AND condition
										cnx = getDBConnector()
										cursor = cnx.cursor()
										#Check original log hit
										query = ("SELECT count(*) as count from attackInventory where victim_ip = '" + str(ip2long(sourceIP)) + "' and threat_id = '" + vulnerability + "' and attack_log = '" + newhit["_source"]["message"] + "'")
										cursor.execute(query)
										for row in cursor:
											logCheck=row[0]
										cursor.close()
										if logCheck==0:
											#Write data to DB
											cursor = cnx.cursor()
											add_logInstance = ("INSERT INTO attackInventory "
																"(victim_ip, attacker_ip, attack_time, attack_log, threat_id) "
																"VALUES (%s, %s, %s, %s, %s)")
											
											logData = (ip2long(sourceIP), ip2long(attackerIP),hit["_source"]["@timestamp"], hit["_source"]["message"], vulnerability)
											# Insert new entry
											cursor.execute(add_logInstance , logData )
										cursor = cnx.cursor()
										#check new log hit
										query = ("SELECT count(*) as count from attackInventory where victim_ip = '" + str(ip2long(sourceIP)) + "' and threat_id = '" + vulnerability + "' and attack_log = '" + newhit["_source"]["message"] + "'")
										cursor.execute(query)
										for row in cursor:
											logCheck=row[0]
										cursor.close()
										if logCheck==0:
											#Write data to DB
											cursor = cnx.cursor()
											add_logInstance = ("INSERT INTO attackInventory "
																"(victim_ip, attacker_ip, attack_time, attack_log, threat_id) "
																"VALUES (%s, %s, %s, %s, %s)")
											
											logData = (ip2long(sourceIP), ip2long(attackerIP),newhit["_source"]["@timestamp"], newhit["_source"]["message"], vulnerability)
											# Insert new entry
											cursor.execute(add_logInstance , logData )
											
											cnx.commit()
											cursor.close()
										cnx.close()
										newResultCount=newResultCount+1
								else:
									newResultCount=0
								resultCount=newResultCount+1
								
								
								
					elif (operator=="OR"):
						if (searchStringCount == 1):
							if (int(numResults) > 0):
								resultCount = int(numResults)
								writeElasticSearchResults(searchResults,vulnObject,sourceIP,vulnerability)
								isExploitFound=True
						if (searchStringCount > 1):
							#Insert startTime
							entry = re.sub('\<startTime\>', startTime, entry)
							#Insert endTime
							entry = re.sub('\<endTime\>', endTime, entry)
							if sourceIP == '*':
								entry = re.sub('\<min_count\>', '1', entry)
							else:
								entry = re.sub('\<min_count\>', '2', entry)
							#only keep searching if there are more IOCS to look at...
							if len(searchStringResults)>1:
								searchResults = ElasticSearchQuery.searchVulnerability(entry,vulnerability,sourceIP,sourceHost)
								numResults = getElasticSearchResults(searchResults)
								if int(numResults) > 0:
									writeElasticSearchResults(searchResults,vulnObject,sourceIP,vulnerability)
								resultCount = resultCount + int(numResults)
					searchStringCount=searchStringCount+1
			if (isExploitFound==True):
				print("  Found " + str(resultCount) + " instances of exploitation!")
				print("  Generating attack logs") 
				#Parse through data list to get elastic timestamp for audit log times...
			else:
				print("  No instances of exploitation found.\n")
	else:
		resultCount=0
		print("Invalid vulnerability ID")
	return(resultCount)
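
In the elastic_search branch, re.sub fills the placeholder tokens (<source_host>, <source_ip>, <startTime>, <endTime>, <min_count>) in a stored search template before the query is issued. A reduced sketch with a made-up template string:

import re

entry = 'evt_dsthost:"<source_host>" AND evt_srcip:"<source_ip>"'
sourceHost = "web01.example.com"
sourceIP = "10.0.0.5"

entry = re.sub(r'\<source_host\>', sourceHost, entry)
entry = re.sub(r'\<source_ip\>', sourceIP, entry)
print(entry)   # -> evt_dsthost:"web01.example.com" AND evt_srcip:"10.0.0.5"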

Example 96

Project: pyfrc Source File: cli_deploy.py
    def run(self, options, robot_class, **static_options):
        
        from .. import config
        config.mode = 'upload'
        
        # run the test suite before uploading
        if not options.skip_tests:
            from .cli_test import PyFrcTest
            
            tester = PyFrcTest()
            
            retval = tester.run_test([], robot_class, options.builtin, ignore_missing_test=True)
            if retval != 0:
                print_err("ERROR: Your robot tests failed, aborting upload.")
                if not sys.stdin.isatty():
                    print_err("- Use --skip-tests if you want to upload anyways")
                    return retval
                
                print()
                if not yesno('- Upload anyways?'):
                    return retval
                
                if not yesno('- Are you sure? Your robot code may crash!'):
                    return retval
                
                print()
                print("WARNING: Uploading code against my better judgement...")
        
        # upload all files in the robot.py source directory
        robot_file = abspath(inspect.getfile(robot_class))
        robot_path = dirname(robot_file)
        robot_filename = basename(robot_file)
        cfg_filename = join(robot_path, '.deploy_cfg')
        
        if not options.nonstandard and robot_filename != 'robot.py':
            print_err("ERROR: Your robot code must be in a file called robot.py (launched from %s)!" % robot_filename)
            print_err()
            print_err("If you really want to do this, then specify the --nonstandard argument")
            return 1
        
        # This probably should be configurable... oh well
        
        deploy_dir = '/home/lvuser'
        py_deploy_dir = '%s/py' % deploy_dir
        
        # note below: deployed_cmd apparently can only be a single line
        
        # In 2015, there were stdout/stderr issues. In 2016, they seem to
        # have been fixed, but need to use -u for it to really work properly
        
        if options.debug:
            deployed_cmd = 'env LD_LIBRARY_PATH=/usr/local/frc/rpath-lib/ /usr/local/frc/bin/netconsole-host /usr/local/bin/python3 -u %s/%s -v run' % (py_deploy_dir, robot_filename)
            deployed_cmd_fname = 'robotDebugCommand'
            extra_cmd = 'touch /tmp/frcdebug; chown lvuser:ni /tmp/frcdebug'
        else:
            deployed_cmd = 'env LD_LIBRARY_PATH=/usr/local/frc/rpath-lib/ /usr/local/frc/bin/netconsole-host /usr/local/bin/python3 -u -O %s/%s run' % (py_deploy_dir, robot_filename)
            deployed_cmd_fname = 'robotCommand'
            extra_cmd = ''

        if options.in_place:
            del_cmd = ''
        else:
            del_cmd = "[ -d %(py_deploy_dir)s ] && rm -rf %(py_deploy_dir)s"

        del_cmd %= {"py_deploy_dir": py_deploy_dir}
        
        check_version = '/usr/local/bin/python3 -c "exec(open(\\"$SITEPACKAGES/wpilib/version.py\\", \\"r\\").read(), globals()); print(\\"WPILib version on robot is \\" + __version__);exit(0) if __version__ == \\"%s\\" else exit(89)"' % wpilib.__version__
        if options.no_version_check:
            check_version = ''
        
        # This is a nasty bit of code now...
        sshcmd = inspect.cleandoc("""
            /bin/bash -ce '[ -x /usr/local/bin/python3 ] || exit 87
            SITEPACKAGES=$(/usr/local/bin/python3 -c "import site; print(site.getsitepackages()[0])")
            [ -f $SITEPACKAGES/wpilib/version.py ] || exit 88
            %(check_version)s
            %(del_cmd)s
            echo "%(cmd)s" > %(deploy_dir)s/%(cmd_fname)s
            %(extra_cmd)s'
        """)
              
        sshcmd %= {
            'del_cmd': del_cmd,
            'deploy_dir': deploy_dir,
            'cmd': deployed_cmd,
            'cmd_fname': deployed_cmd_fname,
            'extra_cmd': extra_cmd,
            'check_version': check_version
        }
        
        sshcmd = re.sub("\n+", ";", sshcmd)
        
        nc_thread = None
        
        try:
            controller = installer.ssh_from_cfg(cfg_filename,
                                                username='lvuser',
                                                password='',
                                                hostname=options.robot,
                                                allow_mitm=True,
                                                no_resolve=options.no_resolve)
            
            # Housekeeping first
            logger.debug('SSH: %s', sshcmd)
            controller.ssh(sshcmd)
            
            # Copy the files over, copy to a temporary directory first
            # -> this is inefficient, but it's easier in sftp
            tmp_dir = tempfile.mkdtemp()
            py_tmp_dir = join(tmp_dir, 'py')
                    
            try:
                self._copy_to_tmpdir(py_tmp_dir, robot_path)
                controller.sftp(py_tmp_dir, deploy_dir, mkdir=not options.in_place)
            finally:
                shutil.rmtree(tmp_dir)
            
            # start the netconsole listener now if requested, *before* we
            # actually start the robot code, so we can see all messages
            if options.nc:
                from netconsole import run
                nc_event = threading.Event()
                nc_thread = threading.Thread(target=run,
                                             kwargs={'init_event': nc_event},
                                             daemon=True)
                nc_thread.start()
                nc_event.wait(5)
                logger.info("Netconsole is listening...")
            
            if not options.in_place:
                # Restart the robot code and we're done!
                sshcmd = "/bin/bash -ce '" + \
                         '. /etc/profile.d/natinst-path.sh; ' + \
                         'chown -R lvuser:ni %s; ' + \
                         '/usr/local/frc/bin/frcKillRobot.sh -t -r' + \
                         "'"
            
                sshcmd %= (py_deploy_dir)
            
                logger.debug('SSH: %s', sshcmd)
                controller.ssh(sshcmd)
            
        except installer.SshExecError as e:
            if e.retval == 87:
                print_err("ERROR: python3 was not found on the roboRIO: have you installed robotpy?")
            elif e.retval == 88:
                print_err("ERROR: WPILib was not found on the roboRIO: have you installed robotpy?")
            elif e.retval == 89:
                print_err("ERROR: expected WPILib version %s" % wpilib.__version__)
                print_err()
                print_err("You should either:")
                print_err("- If the robot version is older, upgrade the RobotPy on your robot")
                print_err("- Otherwise, upgrade pyfrc on your computer")
                print_err()
                print_err("Alternatively, you can specify --no-version-check to skip this check")
            else:
                print_err("ERROR: %s" % e)
            return 1
        except installer.Error as e:
            print_err("ERROR: %s" % e)
            return 1
        else:
            print("\nSUCCESS: Deploy was successful!")
        
        if nc_thread is not None:
            nc_thread.join()
        
        return 0
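
The deploy command is written as a readable multi-line bash script and then flattened with re.sub("\n+", ";") so it can travel over ssh as a single line. A reduced sketch of that step:

import inspect
import re

sshcmd = inspect.cleandoc("""
    /bin/bash -ce 'echo hello
    echo world'
""")

# fold the multi-line script into one line, joining statements with ';'
print(re.sub("\n+", ";", sshcmd))   # -> /bin/bash -ce 'echo hello;echo world'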

Example 97

Project: khard Source File: config.py
Function: init
        def __init__(self):
            self.config = None
            self.address_book_list = []
            self.original_uid_dict = {}
            self.uid_dict = {}

            # set locale
            locale.setlocale(locale.LC_ALL, '')

            # load config file
            xdg_config_home = os.environ.get("XDG_CONFIG_HOME") or \
                os.path.expanduser("~/.config")
            config_file = os.environ.get("KHARD_CONFIG") or \
                os.path.join(xdg_config_home, "khard", "khard.conf")
            if not os.path.exists(config_file):
                print("Config file %s not available" % config_file)
                sys.exit(2)

            # parse config file contents
            try:
                self.config = configobj.ConfigObj(
                    config_file, interpolation=False)
            except configobj.ParseError as err:
                print("Error in config file\n%s" % err)
                sys.exit(2)

            # general settings
            if "general" not in self.config:
                print('Error in config file\n'
                      'Missing main section "[general]".')
                sys.exit(2)

            # debug
            if 'debug' not in self.config['general']:
                self.config['general']['debug'] = False
            elif self.config['general']['debug'] == "yes":
                self.config['general']['debug'] = True
            elif self.config['general']['debug'] == "no":
                self.config['general']['debug'] = False
            else:
                print("Error in config file\n"
                      "Invalid value for debug parameter\n"
                      "Possible values: yes, no")
                sys.exit(2)

            # editor
            self.config['general']['editor'] = \
                self.config['general'].get("editor") \
                or os.environ.get("EDITOR")
            if self.config['general']['editor'] is None:
                print("Error in config file\n"
                      "Set path to your preferred text editor in khard's "
                      "config file or the $EDITOR shell variable\n"
                      "Example for khard.conf: editor = vim")
                sys.exit(2)
            self.config['general']['editor'] = find_executable(
                os.path.expanduser(self.config['general']['editor']))
            if self.config['general']['editor'] is None:
                print("Error in config file\n"
                      "Invalid editor path or executable not found.")
                sys.exit(2)

            # merge editor
            self.config['general']['merge_editor'] = \
                self.config['general'].get("merge_editor") \
                or os.environ.get("MERGE_EDITOR")
            if self.config['general']['merge_editor'] is None:
                print("Error in config file\nSet path to your preferred text "
                      "merge editor in khard's config file or the "
                      "$MERGE_EDITOR shell variable\n"
                      "Example for khard.conf: merge_editor = vimdiff")
                sys.exit(2)
            self.config['general']['merge_editor'] = find_executable(
                os.path.expanduser(self.config['general']['merge_editor']))
            if self.config['general']['merge_editor'] is None:
                print("Error in config file\n"
                      "Invalid merge editor path or executable not found.")
                sys.exit(2)

            # default action
            if "default_action" not in self.config['general']:
                print("Error in config file\n"
                      "Missing default action parameter.")
                sys.exit(2)
            elif self.config['general']['default_action'] not in \
                    Actions.get_list_of_all_actions():
                print("Error in config file\nInvalid value for default_action "
                      "parameter\nPossible values: %s" % ', '.join(
                          sorted(Actions.get_list_of_all_actions())))
                sys.exit(2)

            # contact table settings
            if "contact table" not in self.config:
                self.config['contact table'] = {}

            # sort contact table by first or last name
            if "sort" not in self.config['contact table']:
                self.config['contact table']['sort'] = "first_name"
            elif self.config['contact table']['sort'] not in \
                    ["first_name", "last_name"]:
                print("Error in config file\n"
                      "Invalid value for sort parameter\n"
                      "Possible values: first_name, last_name")
                sys.exit(2)

            # display names in contact table by first or last name
            if "display" not in self.config['contact table']:
                # if display by name attribute is not present in the config
                # file use the sort attribute value for backwards compatibility
                self.config['contact table']['display'] = \
                        self.config['contact table']['sort']
            elif self.config['contact table']['display'] not in \
                    ["first_name", "last_name"]:
                print("Error in config file\n"
                      "Invalid value for display parameter\n"
                      "Possible values: first_name, last_name")
                sys.exit(2)

            # reverse contact table
            if 'reverse' not in self.config['contact table']:
                self.config['contact table']['reverse'] = False
            elif self.config['contact table']['reverse'] == "yes":
                self.config['contact table']['reverse'] = True
            elif self.config['contact table']['reverse'] == "no":
                self.config['contact table']['reverse'] = False
            else:
                print("Error in config file\n"
                      "Invalid value for reverse parameter\n"
                      "Possible values: yes, no")
                sys.exit(2)

            # group contact table by address book
            if "group_by_addressbook" not in self.config['contact table']:
                self.config['contact table']['group_by_addressbook'] = False
            elif self.config['contact table']['group_by_addressbook'] == "yes":
                self.config['contact table']['group_by_addressbook'] = True
            elif self.config['contact table']['group_by_addressbook'] == "no":
                self.config['contact table']['group_by_addressbook'] = False
            else:
                print("Error in config file\n"
                      "Invalid value for group_by_addressbook parameter\n"
                      "Possible values: yes, no")
                sys.exit(2)

            # nickname
            if "show_nicknames" not in self.config['contact table']:
                self.config['contact table']['show_nicknames'] = False
            elif self.config['contact table']['show_nicknames'] == "yes":
                self.config['contact table']['show_nicknames'] = True
            elif self.config['contact table']['show_nicknames'] == "no":
                self.config['contact table']['show_nicknames'] = False
            else:
                print("Error in config file\n"
                      "Invalid value for show_nicknames parameter\n"
                      "Possible values: yes, no")
                sys.exit(2)

            # show uids
            if "show_uids" not in self.config['contact table']:
                self.config['contact table']['show_uids'] = True
            elif self.config['contact table']['show_uids'] == "yes":
                self.config['contact table']['show_uids'] = True
            elif self.config['contact table']['show_uids'] == "no":
                self.config['contact table']['show_uids'] = False
            else:
                print("Error in config file\n"
                      "Invalid value for show_uids parameter\n"
                      "Possible values: yes, no")
                sys.exit(2)

            # vcard settings
            if "vcard" not in self.config:
                self.config['vcard'] = {}

            # get supported private objects
            if "private_objects" not in self.config['vcard']:
                self.config['vcard']['private_objects'] = []
            else:
                # check if object only contains letters, digits or -
                for object in self.config['vcard']['private_objects']:
                    if object != re.sub("[^a-zA-Z0-9-]", "", object):
                        print("Error in config file\n"
                              "private object %s may only contain letters, "
                              "digits and the \"-\" character." % object)
                        sys.exit(2)
                    if object == re.sub("[^-]", "", object) \
                            or object.startswith("-") \
                            or object.endswith("-"):
                        print("Error in config file\n"
                              "A \"-\" in a private object label must be "
                              "at least surrounded by one letter or digit.")
                        sys.exit(2)

            # preferred vcard version
            if "preferred_version" not in self.config['vcard']:
                self.config['vcard']['preferred_version'] = "3.0"
            elif self.config['vcard']['preferred_version'] not in \
                    self.get_supported_vcard_versions():
                print("Error in config file\n"
                      "Invalid value for preferred_version parameter\n"
                      "Possible values: %s"
                      % self.get_supported_vcard_versions())
                sys.exit(2)

            # speed up program by pre-searching in the vcard source files
            if 'search_in_source_files' not in self.config['vcard']:
                self.config['vcard']['search_in_source_files'] = False
            elif self.config['vcard']['search_in_source_files'] == "yes":
                self.config['vcard']['search_in_source_files'] = True
            elif self.config['vcard']['search_in_source_files'] == "no":
                self.config['vcard']['search_in_source_files'] = False
            else:
                print("Error in config file\n"
                      "Invalid value for search_in_source_files parameter\n"
                      "Possible values: yes, no")
                sys.exit(2)

            # skip unparsable vcards
            if 'skip_unparsable' not in self.config['vcard']:
                self.config['vcard']['skip_unparsable'] = False
            elif self.config['vcard']['skip_unparsable'] == "yes":
                self.config['vcard']['skip_unparsable'] = True
            elif self.config['vcard']['skip_unparsable'] == "no":
                self.config['vcard']['skip_unparsable'] = False
            else:
                print("Error in config file\n"
                      "Invalid value for skip_unparsable parameter\n"
                      "Possible values: yes, no")
                sys.exit(2)

            # load address books
            if "addressbooks" not in self.config:
                print('Error in config file\n'
                      'Missing main section "[addressbooks]".')
                sys.exit(2)
            if len(self.config['addressbooks'].keys()) == 0:
                print("Error in config file\n"
                      "No address book entries available.")
                sys.exit(2)
            for name in self.config['addressbooks'].keys():
                # create address book object
                try:
                    address_book = AddressBook(
                        name, self.config['addressbooks'][name]['path'])
                except KeyError as e:
                    print("Error in config file\n"
                          "Missing path to the \"%s\" address book." % name)
                    sys.exit(2)
                except IOError as e:
                    print("Error in config file\n%s" % e)
                    sys.exit(2)
                else:
                    # add address book to list
                    self.address_book_list.append(address_book)
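
The private-object check near the top of this snippet leans on two re.sub calls: the first strips every character that is not a letter, digit or "-" (so a changed string means an illegal character was present), and the second strips everything except "-" (so an unchanged string means the label is nothing but hyphens). A minimal standalone sketch of that validation, with a hypothetical helper name:

import re

def is_valid_private_object_label(label):
    # reject labels containing anything other than letters, digits and "-"
    if label != re.sub("[^a-zA-Z0-9-]", "", label):
        return False
    # reject labels that are only "-" or that start/end with "-"
    if label == re.sub("[^-]", "", label) or label.startswith("-") or label.endswith("-"):
        return False
    return True

print(is_valid_private_object_label("jabber-id"))  # True
print(is_valid_private_object_label("-nickname"))  # False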

Example 98

Project: tahoe-lafs Source File: fixups.py
def initialize():
    global done
    import sys
    if sys.platform != "win32" or done:
        return True
    done = True

    import codecs, re
    from ctypes import WINFUNCTYPE, WinError, windll, POINTER, byref, c_int, get_last_error
    from ctypes.wintypes import BOOL, HANDLE, DWORD, UINT, LPWSTR, LPCWSTR, LPVOID

    from allmydata.util import log
    from allmydata.util.encodingutil import canonical_encoding

    # <https://msdn.microsoft.com/en-us/library/ms680621%28VS.85%29.aspx>
    SetErrorMode = WINFUNCTYPE(
        UINT,  UINT,
        use_last_error=True
    )(("SetErrorMode", windll.kernel32))

    SEM_FAILCRITICALERRORS = 0x0001
    SEM_NOOPENFILEERRORBOX = 0x8000

    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOOPENFILEERRORBOX)

    original_stderr = sys.stderr

    # If any exception occurs in this code, we'll probably try to print it on stderr,
    # which makes for frustrating debugging if stderr is directed to our wrapper.
    # So be paranoid about catching errors and reporting them to original_stderr,
    # so that we can at least see them.
    def _complain(message):
        print >>original_stderr, isinstance(message, str) and message or repr(message)
        log.msg(message, level=log.WEIRD)

    # Work around <http://bugs.python.org/issue6058>.
    codecs.register(lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)

    # Make Unicode console output work independently of the current code page.
    # This also fixes <http://bugs.python.org/issue1602>.
    # Credit to Michael Kaplan <https://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx>
    # and TZOmegaTZIOY
    # <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.
    try:
        # <https://msdn.microsoft.com/en-us/library/ms683231(VS.85).aspx>
        # HANDLE WINAPI GetStdHandle(DWORD nStdHandle);
        # returns INVALID_HANDLE_VALUE, NULL, or a valid handle
        #
        # <https://msdn.microsoft.com/en-us/library/aa364960(VS.85).aspx>
        # DWORD WINAPI GetFileType(DWORD hFile);
        #
        # <https://msdn.microsoft.com/en-us/library/ms683167(VS.85).aspx>
        # BOOL WINAPI GetConsoleMode(HANDLE hConsole, LPDWORD lpMode);

        GetStdHandle = WINFUNCTYPE(
            HANDLE,  DWORD,
            use_last_error=True
        )(("GetStdHandle", windll.kernel32))

        STD_OUTPUT_HANDLE = DWORD(-11)
        STD_ERROR_HANDLE  = DWORD(-12)

        GetFileType = WINFUNCTYPE(
            DWORD,  DWORD,
            use_last_error=True
        )(("GetFileType", windll.kernel32))

        FILE_TYPE_CHAR   = 0x0002
        FILE_TYPE_REMOTE = 0x8000

        GetConsoleMode = WINFUNCTYPE(
            BOOL,  HANDLE, POINTER(DWORD),
            use_last_error=True
        )(("GetConsoleMode", windll.kernel32))

        INVALID_HANDLE_VALUE = DWORD(-1).value

        def not_a_console(handle):
            if handle == INVALID_HANDLE_VALUE or handle is None:
                return True
            return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                    or GetConsoleMode(handle, byref(DWORD())) == 0)

        old_stdout_fileno = None
        old_stderr_fileno = None
        if hasattr(sys.stdout, 'fileno'):
            old_stdout_fileno = sys.stdout.fileno()
        if hasattr(sys.stderr, 'fileno'):
            old_stderr_fileno = sys.stderr.fileno()

        STDOUT_FILENO = 1
        STDERR_FILENO = 2
        real_stdout = (old_stdout_fileno == STDOUT_FILENO)
        real_stderr = (old_stderr_fileno == STDERR_FILENO)

        if real_stdout:
            hStdout = GetStdHandle(STD_OUTPUT_HANDLE)
            if not_a_console(hStdout):
                real_stdout = False

        if real_stderr:
            hStderr = GetStdHandle(STD_ERROR_HANDLE)
            if not_a_console(hStderr):
                real_stderr = False

        if real_stdout or real_stderr:
            # <https://msdn.microsoft.com/en-us/library/windows/desktop/ms687401%28v=vs.85%29.aspx>
            # BOOL WINAPI WriteConsoleW(HANDLE hOutput, LPWSTR lpBuffer, DWORD nChars,
            #                           LPDWORD lpCharsWritten, LPVOID lpReserved);

            WriteConsoleW = WINFUNCTYPE(
                BOOL,  HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID,
                use_last_error=True
            )(("WriteConsoleW", windll.kernel32))

            class UnicodeOutput:
                def __init__(self, hConsole, stream, fileno, name):
                    self._hConsole = hConsole
                    self._stream = stream
                    self._fileno = fileno
                    self.closed = False
                    self.softspace = False
                    self.mode = 'w'
                    self.encoding = 'utf-8'
                    self.name = name
                    if hasattr(stream, 'encoding') and canonical_encoding(stream.encoding) != 'utf-8':
                        log.msg("%s: %r had encoding %r, but we're going to write UTF-8 to it" %
                                (name, stream, stream.encoding), level=log.CURIOUS)
                    self.flush()

                def isatty(self):
                    return False
                def close(self):
                    # don't really close the handle, that would only cause problems
                    self.closed = True
                def fileno(self):
                    return self._fileno
                def flush(self):
                    if self._hConsole is None:
                        try:
                            self._stream.flush()
                        except Exception, e:
                            _complain("%s.flush: %r from %r" % (self.name, e, self._stream))
                            raise

                def write(self, text):
                    try:
                        if self._hConsole is None:
                            if isinstance(text, unicode):
                                text = text.encode('utf-8')
                            self._stream.write(text)
                        else:
                            if not isinstance(text, unicode):
                                text = str(text).decode('utf-8')
                            remaining = len(text)
                            while remaining > 0:
                                n = DWORD(0)
                                # There is a shorter-than-documented limitation on the length of the string
                                # passed to WriteConsoleW (see #1232).
                                retval = WriteConsoleW(self._hConsole, text, min(remaining, 10000), byref(n), None)
                                if retval == 0:
                                    raise IOError("WriteConsoleW failed with WinError: %s" % (WinError(get_last_error()),))
                                if n.value == 0:
                                    raise IOError("WriteConsoleW returned %r, n.value = 0" % (retval,))
                                remaining -= n.value
                                if remaining == 0: break
                                text = text[n.value:]
                    except Exception, e:
                        _complain("%s.write: %r" % (self.name, e))
                        raise

                def writelines(self, lines):
                    try:
                        for line in lines:
                            self.write(line)
                    except Exception, e:
                        _complain("%s.writelines: %r" % (self.name, e))
                        raise

            if real_stdout:
                sys.stdout = UnicodeOutput(hStdout, None, STDOUT_FILENO, '<Unicode console stdout>')
            else:
                sys.stdout = UnicodeOutput(None, sys.stdout, old_stdout_fileno, '<Unicode redirected stdout>')

            if real_stderr:
                sys.stderr = UnicodeOutput(hStderr, None, STDERR_FILENO, '<Unicode console stderr>')
            else:
                sys.stderr = UnicodeOutput(None, sys.stderr, old_stderr_fileno, '<Unicode redirected stderr>')
    except Exception, e:
        _complain("exception %r while fixing up sys.stdout and sys.stderr" % (e,))

    # This works around <http://bugs.python.org/issue2128>.

    # <https://msdn.microsoft.com/en-us/library/windows/desktop/ms683156%28v=vs.85%29.aspx>
    GetCommandLineW = WINFUNCTYPE(
        LPWSTR,
        use_last_error=True
    )(("GetCommandLineW", windll.kernel32))

    # <https://msdn.microsoft.com/en-us/library/windows/desktop/bb776391%28v=vs.85%29.aspx>
    CommandLineToArgvW = WINFUNCTYPE(
        POINTER(LPWSTR),  LPCWSTR, POINTER(c_int),
        use_last_error=True
    )(("CommandLineToArgvW", windll.shell32))

    argc = c_int(0)
    argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc))
    if argv_unicode is None:
        raise WinError(get_last_error())

    # Because of <http://bugs.python.org/issue8775> (and similar limitations in
    # twisted), the 'bin/tahoe' script cannot invoke us with the actual Unicode arguments.
    # Instead it "mangles" or escapes them using \x7F as an escape character, which we
    # unescape here.
    def unmangle(s):
        return re.sub(ur'\x7F[0-9a-fA-F]*\;', lambda m: unichr(int(m.group(0)[1:-1], 16)), s)

    try:
        argv = [unmangle(argv_unicode[i]).encode('utf-8') for i in xrange(0, argc.value)]
    except Exception, e:
        _complain("%s:  could not unmangle Unicode arguments.\n%r"
                  % (sys.argv[0], [argv_unicode[i] for i in xrange(0, argc.value)]))
        raise

    # Take only the suffix with the same number of arguments as sys.argv.
    # This accounts for anything that can cause initial arguments to be stripped,
    # for example, the Python interpreter or any options passed to it, or runner
    # scripts such as 'coverage run'. It works even if there are no such arguments,
    # as in the case of a frozen executable created by bb-freeze or similar.

    sys.argv = argv[-len(sys.argv):]
    if sys.argv[0].endswith('.pyscript'):
        sys.argv[0] = sys.argv[0][:-9]
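
The unmangle helper above is the re.sub call of interest: bin/tahoe escapes non-ASCII argument characters as \x7F followed by hex digits and a ";", and unmangle reverses that. The project code is Python 2 (unichr, ur'' literals, "except Exception, e"); below is a minimal round-trip sketch adapted to Python 3, where the mangle helper is hypothetical (only unmangle appears in the source):

import re

def mangle(s):
    # hypothetical inverse of unmangle: escape each non-ASCII character as \x7F<hex>;
    return "".join(c if ord(c) < 128 else "\x7f%x;" % ord(c) for c in s)

def unmangle(s):
    # same substitution as in the example, with chr() instead of Python 2's unichr()
    return re.sub(r"\x7F[0-9a-fA-F]*;", lambda m: chr(int(m.group(0)[1:-1], 16)), s)

original = "caf\u00e9 \u2603"
assert unmangle(mangle(original)) == original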

Example 99

Project: pelisalacarta Source File: aquitorrent.py
def fanart(item):
    logger.info("pelisalacarta.aquitorrent fanart")
    itemlist = []
    url = item.url
    data = scrapertools.cache_page(url)
    data = re.sub(r"\n|\r|\t|\s{2}|\(.*?\)|&nbsp;","",data)
    title = item.extra
    
    
    

    year=""
    item.title = re.sub(r"-|\(.*?\)|\d+x\d+","",item.title)
    if not "Series" in item.url:
        urlyear = item.url
        data = scrapertools.cache_page(urlyear)
        try:
            year =scrapertools.get_match(data,'<span style="text-align: justify;">.*?Año.*?(\d\d\d\d)')
        except:
            year = ""
        try:
            
            if "CLASICOS-DISNEY" in item.url:
                title = title + " "+"Disney"
            try:
                ### Search Tmdb for the movie by title and year
                title_tmdb = title.replace(" ","%20")
                url_tmdb="http://api.themoviedb.org/3/search/movie?api_key=2e2160006592024ba87ccdf78c28f49f&query=" + title_tmdb +"&year="+year+"&language=es&include_adult=false"
                
                data = scrapertools.cachePage(url_tmdb)
                data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
                id = scrapertools.get_match(data,'"page":1.*?,"id":(.*?),')
            
            except:
                if ":" in title or "(" in title:
                    title_tmdb = title.replace(" ","%20")
                    url_tmdb="http://api.themoviedb.org/3/search/movie?api_key=2e2160006592024ba87ccdf78c28f49f&query=" + title_tmdb +"&year="+year+"&language=es&include_adult=false"
                    data = scrapertools.cachePage(url_tmdb)
                    data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
                    id = scrapertools.get_match(data,'"page":1.*?,"id":(.*?),')
                    
                else:
                    title_tmdb = title.replace(" ","%20")
                    title_tmdb= re.sub(r"(:.*)|\(.*?\)","",title_tmdb)
                    url_tmdb="http://api.themoviedb.org/3/search/movie?api_key=2e2160006592024ba87ccdf78c28f49f&query=" + title_tmdb +"&year="+year+"&language=es&include_adult=false"
                    data = scrapertools.cachePage(url_tmdb)
                    data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
                    id = scrapertools.get_match(data,'"page":1.*?,"id":(.*?),')



        except:
            ### If there is no match, do a Bing search for the Imdb id
            urlbing_imdb = "http://www.bing.com/search?q=%s+%s+site:imdb.com" % (title.replace(' ', '+'),  year)
            data = browser (urlbing_imdb)
            
            try:
                subdata_imdb = scrapertools.get_match(data,'<li class="b_algo">(.*?)h="ID')
            except:
                pass
            
            try:
                url_imdb = scrapertools.get_match(subdata_imdb,'<a href="([^"]+)"')
        
            except:
                pass
            try:
                id_imdb = scrapertools.get_match(url_imdb,'.*?www.imdb.com/.*?/(.*?)/')
            except:
                pass
            try:
                ### Look up the Tmdb id using the Imdb id
                urltmdb_remote ="https://api.themoviedb.org/3/find/"+id_imdb+"?external_source=imdb_id&api_key=2e2160006592024ba87ccdf78c28f49f&language=es&include_adult=false"
                data = scrapertools.cachePage(urltmdb_remote)
                data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
                id = scrapertools.get_match(data,'"movie_results".*?,"id":(\d+)')
                
            except:
                id = ""
                 
        
        ### By this point we have the Tmdb id (or not); look for fanart_1
        urltmdb_fan1 ="http://api.themoviedb.org/3/movie/"+id+"?api_key=2e2160006592024ba87ccdf78c28f49f"
        data = scrapertools.cachePage( urltmdb_fan1 )
        data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
        patron = '"adult".*?"backdrop_path":"(.*?)"'
        matches = re.compile(patron,re.DOTALL).findall(data)
        try:
            ### Try the Tmdb poster
            posterdb = scrapertools.get_match(data,'"adult".*?"poster_path":"(.*?)"')
            posterdb =  "https://image.tmdb.org/t/p/original" + posterdb
        except:
            posterdb = item.thumbnail
    
        if len(matches)==0:
            fanart_info = item.fanart
            fanart= item.fanart
            fanart_2 = item.fanart
            itemlist.append( Item(channel=item.channel, title =item.title, url=item.url, action="findvideos", thumbnail=posterdb, fanart=fanart ,extra= fanart_2, folder=True) )
        for fan in matches:
    
            fanart="https://image.tmdb.org/t/p/original" + fan
            fanart_1= fanart
            
            ### Look in Tmdb for the info fanart, the trailer fanart and fanart_2 (findvideos)
            urltmdb_images ="http://api.themoviedb.org/3/movie/"+id+"/images?api_key=2e2160006592024ba87ccdf78c28f49f"
            data = scrapertools.cachePage(urltmdb_images)
            data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
            
            patron = '"backdrops".*?"file_path":".*?",.*?"file_path":"(.*?)",.*?"file_path":"(.*?)",.*?"file_path":"(.*?)"'
            matches = re.compile(patron,re.DOTALL).findall(data)
            
            if len(matches) == 0:
                patron = '"backdrops".*?"file_path":"(.*?)",.*?"file_path":"(.*?)",.*?"file_path":"(.*?)"'
                matches = re.compile(patron,re.DOTALL).findall(data)
                if len(matches) == 0:
                    fanart_info = fanart_1
                    fanart_trailer = fanart_1
                    fanart_2 = fanart_1
                    category =""
            for fanart_info, fanart_trailer, fanart_2 in matches:
                fanart_info = "https://image.tmdb.org/t/p/original" + fanart_info
                fanart_trailer = "https://image.tmdb.org/t/p/original" + fanart_trailer
                fanart_2 = "https://image.tmdb.org/t/p/original" + fanart_2
                category = ""
                
                if fanart_info == fanart:
                    ### Look for fanart_info on Imdb if it matches fanart
                    try:
                        url_imdbphoto = "http://www.imdb.com/title/"+id_imdb+"/mediaindex"
                        photo_imdb= scrapertools.get_match(url_imdbphoto,'<div class="media_index_thumb_list".*?src="([^"]+)"')
                        photo_imdb = photo_imdb.replace("@._V1_UY100_CR25,0,100,100_AL_.jpg","@._V1_SX1280_SY720_.jpg")
                        fanart_info = photo_imdb
                    except:
                        fanart_info = fanart_2
            itemlist.append( Item(channel=item.channel, title =item.title, url=item.url, action="findvideos", thumbnail=posterdb, fanart=fanart_1 ,extra= fanart_2, folder=True) )



    else:
        urlyear = item.url
        data = scrapertools.cache_page(urlyear)
        try:
            year =scrapertools.get_match(data,'<span style="text-align: justify;">.*?Año.*?(\d\d\d\d)')
        except:
            try:
                year = scrapertools.get_match(data,'SINOPSIS.*? \((\d\d\d\d)')
            except:
                year = ""
        # Bing search for the Imdb series id
        url_imdb = "http://www.bing.com/search?q=%s+%s+tv+series+site:imdb.com" % (title.replace(' ', '+'),  year)
        data = browser (url_imdb)
        data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
        
        try:
            subdata_imdb = scrapertools.get_match(data,'<li class="b_algo">(.*?)h="ID')
        except:
            pass
        try:
            imdb_id = scrapertools.get_match(subdata_imdb,'<a href=.*?http.*?imdb.com/title/(.*?)/.*?"')
        except:
            imdb_id = ""
        ### Look up the tvdb id using the imdb id
        urltvdb_remote="http://thetvdb.com/api/GetSeriesByRemoteID.php?imdbid="+imdb_id+"&language=es"
        data = scrapertools.cachePage(urltvdb_remote)
        data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
        patron = '<Data><Series><seriesid>([^<]+)</seriesid>.*?<Overview>(.*?)</Overview>'
        matches = re.compile(patron,re.DOTALL).findall(data)
        
        if len(matches)== 0:
            print "gooooooo"
            ### If there is no match, search tvdb directly
            if ":" in title or "(" in title:
                title= title.replace(" ","%20")
                url_tvdb="http://thetvdb.com/api/GetSeries.php?seriesname=" + title + "&language=es"
                data = scrapertools.cachePage(url_tvdb)
                data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
                patron = '<Data><Series><seriesid>([^<]+)</seriesid>.*?<Overview>(.*?)</Overview>'
                matches = re.compile(patron,re.DOTALL).findall(data)
                if len(matches)== 0:
                    title= re.sub(r"(:.*)|\(.*?\)","",title)
                    title= title.replace(" ","%20")
                    url_tvdb="http://thetvdb.com/api/GetSeries.php?seriesname=" + title + "&language=es"
                    data = scrapertools.cachePage(url_tvdb)
                    data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
                    patron = '<Data><Series><seriesid>([^<]+)</seriesid>.*?<Overview>(.*?)</Overview>'
                    matches = re.compile(patron,re.DOTALL).findall(data)
                        
                    if len(matches) == 0:
                        plot = ""
                        postertvdb = item.thumbnail
                        extra= "http://s6.postimg.org/rv2mu3pap/bityouthsinopsis2.png"
                        fanart_info = "http://s6.postimg.org/6ucl96lsh/bityouthnofan.jpg"
                        fanart_trailer = "http://s6.postimg.org/6ucl96lsh/bityouthnofan.jpg"
                        category= ""
                        show = title+"|"+year+"|"+"http://s6.postimg.org/mh3umjzkh/bityouthnofanventanuco.jpg"
                        itemlist.append( Item(channel=item.channel, title=item.title, url=item.url, action="findvideos", thumbnail=item.thumbnail, fanart="http://s6.postimg.org/6ucl96lsh/bityouthnofan.jpg" ,extra=extra, category= category,  show=show ,plot=plot, folder=True) )
        
            else:
                title= title.replace(" ","%20")
                url_tvdb="http://thetvdb.com/api/GetSeries.php?seriesname=" + title + "&language=es"
                data = scrapertools.cachePage(url_tvdb)
                data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
                patron = '<Data><Series><seriesid>([^<]+)</seriesid>.*?<Overview>(.*?)</Overview>'
                print "perroooo"
                print patron
                matches = re.compile(patron,re.DOTALL).findall(data)
                print matches
                if len(matches) == 0:
                    plot = ""
                    postertvdb = item.thumbnail
                    extra= "http://s6.postimg.org/rv2mu3pap/bityouthsinopsis2.png"
                    show = title+"|"+year+"|"+"http://s6.postimg.org/mh3umjzkh/bityouthnofanventanuco.jpg"
                    fanart_info = "http://s6.postimg.org/6ucl96lsh/bityouthnofan.jpg"
                    fanart_trailer = "http://s6.postimg.org/6ucl96lsh/bityouthnofan.jpg"
                    category= ""
                    itemlist.append( Item(channel=item.channel, title=item.title, url=item.url, action="findvideos", thumbnail=item.thumbnail, fanart="http://s6.postimg.org/6ucl96lsh/bityouthnofan.jpg" ,extra=extra, category= category,  show=show ,plot= plot, folder=True) )
        #fanart
        for  id, info in matches:
            
            category = id
            plot = info
            id_serie = id
            
            url ="http://thetvdb.com/api/1D62F2F90030C444/series/"+id_serie+"/banners.xml"
            
            data = scrapertools.cachePage(url)
            data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data)
            patron = '<Banners><Banner>.*?<VignettePath>(.*?)</VignettePath>'
            matches = re.compile(patron,re.DOTALL).findall(data)
            try:
                postertvdb = scrapertools.get_match(data,'<Banners><Banner>.*?<BannerPath>posters/(.*?)</BannerPath>')
                postertvdb =  "http://thetvdb.com/banners/_cache/posters/" + postertvdb
            except:
                postertvdb = item.thumbnail
                                
            if len(matches)==0:
                extra="http://s6.postimg.org/rv2mu3pap/bityouthsinopsis2.png"
                show = title+"|"+year+"|"+"http://s6.postimg.org/mh3umjzkh/bityouthnofanventanuco.jpg"
                fanart_info = "http://s6.postimg.org/6ucl96lsh/bityouthnofan.jpg"
                fanart_trailer = "http://s6.postimg.org/6ucl96lsh/bityouthnofan.jpg"
                itemlist.append( Item(channel=item.channel, title=item.title, url=item.url, action="findvideos", thumbnail=postertvdb, fanart="http://s6.postimg.org/6ucl96lsh/bityouthnofan.jpg"  ,category = category, extra=extra, show=show,folder=True) )
                                                        
            for fan in matches:
                fanart="http://thetvdb.com/banners/" + fan
                fanart_1= fanart
                patron= '<Banners><Banner>.*?<BannerPath>.*?</BannerPath>.*?</Banner><Banner>.*?<BannerPath>(.*?)</BannerPath>.*?</Banner><Banner>.*?<BannerPath>(.*?)</BannerPath>.*?</Banner><Banner>.*?<BannerPath>(.*?)</BannerPath>'
                matches = re.compile(patron,re.DOTALL).findall(data)
                if len(matches)==0:
                   fanart_info= fanart_1
                   fanart_trailer = fanart_1
                   fanart_2 = fanart_1
                   show = title+"|"+year+"|"+fanart_1
                   extra=postertvdb
                   itemlist.append( Item(channel=item.channel, title=item.title, url=item.url, action="findvideos", thumbnail=postertvdb, fanart=fanart_1  ,category = category, extra=extra, show=show,folder=True) )
                for fanart_info, fanart_trailer, fanart_2 in matches:
                    fanart_info = "http://thetvdb.com/banners/" + fanart_info
                    fanart_trailer = "http://thetvdb.com/banners/" + fanart_trailer
                    fanart_2 = "http://thetvdb.com/banners/" + fanart_2
                
                    itemlist.append( Item(channel=item.channel, title =item.title, url=item.url, action="findvideos", thumbnail=postertvdb, fanart=fanart_1 , extra= fanart_2,folder=True) )
    title ="Info"
    title = title.replace(title,"[COLOR skyblue][B]"+title+"[/B][/COLOR]")
    if "Series" in item.url:
        thumbnail = postertvdb
    else:
        thumbnail = posterdb

    itemlist.append( Item(channel=item.channel, action="info" , title=title , url=item.url, thumbnail=thumbnail, fanart=fanart_info , folder=False ))

    return itemlist
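
Nearly every response in this channel is flattened with the same call, re.sub(r"\n|\r|\t|\s{2}|&nbsp;","",data), before the extraction patterns run, so the scrape patterns never have to deal with newlines, tabs or runs of whitespace. A small sketch of that cleanup on a hypothetical markup fragment (the sample string and the year pattern mirror the ones used above but are made up here):

import re

data = '<span style="text-align: justify;">\n\t\tAño 2015&nbsp;</span>'
cleaned = re.sub(r"\n|\r|\t|\s{2}|&nbsp;", "", data)
print(cleaned)  # <span style="text-align: justify;">Año 2015</span>

# once flattened, the year pattern from the example matches on a single line
year = re.search(r'<span style="text-align: justify;">.*?Año.*?(\d\d\d\d)', cleaned).group(1)
print(year)  # 2015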

Example 100

Project: bitcurator Source File: ttfonts.py
Function: extract_info
    def extractInfo(self):
        #################/
        # name - Naming table
        #################/
        self.sFamilyClass = 0
        self.sFamilySubClass = 0

        name_offset = self.seek_table("name")
        format = self.read_ushort()
        if format != 0:
            die("Unknown name table format " + format)
        numRecords = self.read_ushort()
        string_data_offset = name_offset + self.read_ushort()
        names = {1: '', 2: '', 3: '', 4: '', 6: ''}
        K = list(names.keys())
        nameCount = len(names)
        for i in range(numRecords):
            platformId = self.read_ushort()
            encodingId = self.read_ushort()
            languageId = self.read_ushort()
            nameId = self.read_ushort()
            length = self.read_ushort()
            offset = self.read_ushort()
            if (nameId not in K):
                continue
            N = ''
            if (platformId == 3 and encodingId == 1 and languageId == 0x409):  # Microsoft, Unicode, US English, PS Name
                opos = self._pos
                self.seek(string_data_offset + offset)
                if (length % 2 != 0):
                    die("PostScript name is UTF-16BE string of odd length")
                length /= 2
                N = ''
                while (length > 0):
                    char = self.read_ushort()
                    N += (chr(char))
                    length -= 1
                self._pos = opos
                self.seek(opos)

            elif (platformId == 1 and encodingId == 0 and languageId == 0):  # Macintosh, Roman, English, PS Name
                opos = self._pos
                N = self.get_chunk(string_data_offset + offset, length)
                self._pos = opos
                self.seek(opos)

            if (N and names[nameId] == ''):
                names[nameId] = N
                nameCount -= 1
                if (nameCount == 0):
                    break

        if names[6]:
            psName = names[6]
        elif names[4]:
            psName = re.sub(' ', '-', names[4])
        elif names[1]:
            psName = re.sub(' ', '-', names[1])
        else:
            psName = ''
            
        if not psName:
            die("Could not find PostScript font name")
            
        self.name = psName
        if names[1]:
            self.familyName = names[1]
        else:
            self.familyName = psName
            
        if names[2]:
            self.styleName = names[2]
        else:
            self.styleName = 'Regular'
            
        if names[4]:
            self.fullName = names[4]
        else:
            self.fullName = psName
            
        if names[3]:
            self.uniqueFontID = names[3]
        else:
            self.uniqueFontID = psName
            
        if names[6]:
            self.fullName = names[6]

        #################/
        # head - Font header table
        #################/
        self.seek_table("head")
        self.skip(18)
        self.unitsPerEm = unitsPerEm = self.read_ushort()
        scale = 1000 / float(unitsPerEm)
        self.skip(16)
        xMin = self.read_short()
        yMin = self.read_short()
        xMax = self.read_short()
        yMax = self.read_short()
        self.bbox = [(xMin * scale), (yMin * scale),
                     (xMax * scale), (yMax * scale)]
        self.skip(3 * 2)
        
        indexToLocFormat = self.read_ushort()
        glyphDataFormat = self.read_ushort()
        
        if glyphDataFormat != 0:
            die('Unknown glyph data format %d' % glyphDataFormat)

        #################/
        # hhea metrics table
        #################/
        # ttf2t1 seems to use this value rather than the one in OS/2 - so put in for compatibility
        if "hhea" in self.tables:
            self.seek_table("hhea")
            self.skip(4)
            hheaAscender = self.read_short()
            hheaDescender = self.read_short()
            self.ascent = hheaAscender * scale
            self.descent = hheaDescender * scale

        #################/
        # OS/2 - OS/2 and Windows metrics table
        #################/
        if "OS/2" in self.tables:
            self.seek_table("OS/2")
            version = self.read_ushort()
            self.skip(2)
            usWeightClass = self.read_ushort()
            self.skip(2)
            fsType = self.read_ushort()
            if fsType == 0x0002 or (fsType & 0x0300) != 0:
                die('ERROR - Font file ' + self.filename + ' cannot be embedded due to copyright restrictions.')
                self.restrictedUse = True

            self.skip(20)
            sF = self.read_short()
            self.sFamilyClass = (sF >> 8)
            self.sFamilySubClass = (sF & 0xFF)
            self._pos += 10  # PANOSE = 10 byte length
            panose = self.fh.read(10)
            self.skip(26)
            sTypoAscender = self.read_short()
            sTypoDescender = self.read_short()
            
            if not self.ascent:
                self.ascent = (sTypoAscender * scale)
            if not self.descent:
                self.descent = (sTypoDescender * scale)
            if version > 1:
                self.skip(16)
                sCapHeight = self.read_short()
                self.capHeight = (sCapHeight * scale)
            else:
                self.capHeight = self.ascent

        else:
            usWeightClass = 500
            if not self.ascent:
                self.ascent = (yMax * scale)
            if not self.descent:
                self.descent = (yMin * scale)
            self.capHeight = self.ascent

        self.stemV = 50 + int(pow((usWeightClass / 65.0), 2))

        #################/
        # post - PostScript table
        #################/
        self.seek_table("post")
        self.skip(4)
        self.italicAngle = self.read_short() + self.read_ushort() / 65536.0
        self.underlinePosition = self.read_short() * scale
        self.underlineThickness = self.read_short() * scale
        isFixedPitch = self.read_ulong()

        self.flags = 4

        if self.italicAngle != 0:
            self.flags = self.flags | 64
            
        if usWeightClass >= 600:
            self.flags = self.flags | 262144
            
        if isFixedPitch:
            self.flags = self.flags | 1

        #################/
        # hhea - Horizontal header table
        #################/
        self.seek_table("hhea")
        self.skip(32)
        metricDataFormat = self.read_ushort()
        if (metricDataFormat != 0):
            die('Unknown horizontal metric data format %d' % metricDataFormat)
        numberOfHMetrics = self.read_ushort()
        if (numberOfHMetrics == 0):
            die('Number of horizontal metrics is 0')

        #################/
        # maxp - Maximum profile table
        #################/
        self.seek_table("maxp")
        self.skip(4)
        numGlyphs = self.read_ushort()

        #################/
        # cmap - Character to glyph index mapping table
        #################/
        cmap_offset = self.seek_table("cmap")
        self.skip(2)
        cmapTableCount = self.read_ushort()
        unicode_cmap_offset = 0
        for i in range(cmapTableCount):
            platformID = self.read_ushort()
            encodingID = self.read_ushort()
            offset = self.read_ulong()
            save_pos = self._pos

            if (platformID == 3 and encodingID == 1) or platformID == 0:  # Microsoft, Unicode
                format_ = self.get_ushort(cmap_offset + offset)
                print('Format is %s' % format_)
                if format_ == 4:
                    if not unicode_cmap_offset:
                        unicode_cmap_offset = cmap_offset + offset
                    break
            self.seek(save_pos)

        if not unicode_cmap_offset:
            die('Font (' + self.filename + ') does not have cmap for Unicode (platform 3, encoding 1, format 4, or platform 0, any encoding, format 4)')

        glyphToChar = {}
        charToGlyph = {}
        self.getCMAP4(unicode_cmap_offset, glyphToChar, charToGlyph)

        #################/
        # hmtx - Horizontal metrics table
        #################/
        self.getHMTX(numberOfHMetrics, numGlyphs, glyphToChar, scale)
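
The re.sub usage here is the PostScript-name fallback in extractInfo: when name record 6 is missing, the full name (record 4) or family name (record 1) is used with its spaces replaced by hyphens. A minimal sketch with a hypothetical names dict as sample data (for a fixed literal like ' ', names[4].replace(' ', '-') would behave the same way):

import re

names = {1: 'DejaVu Sans', 2: 'Bold', 3: 'DejaVu Sans Bold', 4: 'DejaVu Sans Bold', 6: ''}
if names[6]:
    psName = names[6]
elif names[4]:
    psName = re.sub(' ', '-', names[4])  # full font name, spaces turned into hyphens
elif names[1]:
    psName = re.sub(' ', '-', names[1])  # fall back to the family name
else:
    psName = ''
print(psName)  # DejaVu-Sans-Bold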