#! /usr/bin/python # Requests current BBC prog, gets relevant keywords, sends off relevant Twitter searches # One requester for ALL channels - avoids creating multiple Twitter streams # Only sends keywords to Twitter once per programme, then monitors for the next programme from datetime import date import os import re import string import time from Axon.Ipc import producerFinished from Axon.Ipc import shutdownMicroprocess from Axon.ThreadedComponent import threadedcomponent import MySQLdb import cjson import rdflib from rdflib.Graph import Graph class Requester(threadedcomponent): Inboxes = { "inbox" : "", "control" : "", "whatson" : "Receives back what's currently on a channel - [pid,title,timeoffset,duration,expectedstarttime]", "proginfo" : "Receives back raw RDF data for a PID", "search" : "Receives back raw Twitter people search JSON", "datain" : "URL contents returns from getter component", } Outboxes = { "outbox" : "Sends out keywords and pid(s) for streaming API connections - [[keyword,keyword],[pid,pid,pid]]", "signal" : "", "whatson" : "Requests current programmes by sending a channel name", "proginfo" : "Requests RDF format data for a pid - [pid, 'rdf']", "search" : "Sends people's names for Twitter username identification", "dataout" : "URL requests to getter component", } def __init__(self, channel,dbuser,dbpass): super(Requester, self).__init__() self.channel = channel self.dbuser = dbuser self.dbpass = dbpass # Keep a record of the current PID for each channel here self.channels = { "bbcone" : "", "bbctwo" : "", "bbcthree" : "", "bbcfour" : "", "cbbc" : "", "cbeebies" : "", "bbcnews" : "", "radio1" : "", "radio2" : "", "radio3" : "", "radio4" : "", "5live" : "", "worldservice" : "", "6music" : "", "radio7" : "", "1xtra" : "", "bbcparliament" : "", "asiannetwork" : "", "sportsextra" : "" } # Brand PIDs associated with programmes. New progs don't always have brands, but it's a start # Ideally this would be replaced by the BBC Buzz database, but that's not yet accessible AFAIK and doesn't always store tags for new programmes. # This doesn't help in the channel case where for example radio 1 uses @bbcr1 self.officialbrandtags = { "b00vc3rz" : ["#genius","bbcgenius"], # Genius with Dave Gorman "b006t1q9" : ["#bbcqt","bbcquestiontime"], # Question Time "b009w2w3" : ["#laterjools", "bbclater"], # Later with Jools Holland "b00lwxj1" : ["bbcbang"], # Bang goes the theory "b006m8dq" : ["#scd", "bbcstrictly"], # Strictly come dancing "b006ml0g" : ["qikipedia", "#qi"], # QI "b00j4j7g" : ["#f1"], # Formula 1 "b006wkqb" : ["chrisdjmoyles","chrismoylesshow"], # Chris Moyles Breakfast Show "b0071b63" : ["bbcapprentice"], # The Apprentice "b006mg74" : ["bbcwatchdog"], # Watchdog "b006v5tb" : ['bbcbreakfast'], # Breakfast "b006mkw3" : ["hignfy","bbchignfy"], # Have I Got News For You "b008dk4b" : ["childreninneed","bbccin","#cin","#pudsey","pudseybear"], # Children in Need } # Series PIDs associated with programmes. ONLY used where prog doesn't have a brand self.officialseriestags = { "b00v2z3s" : ["#askrhod"], # Ask Rhod Gilbert "b00vd7qz" : ['film2010'], # Film 2010 "b00vsw36" : ['manlab'] # James May's Man Lab } self.firstrun = True def finished(self): while self.dataReady("control"): msg = self.recv("control") if isinstance(msg, producerFinished) or isinstance(msg, shutdownMicroprocess): self.send(msg, "signal") return True return False def doStuff(self, channel): # Check what's on for each channel self.send(channel, "whatson") while not self.dataReady("whatson"): pass data = self.recv("whatson") if data == None: pid = None else: pid = data[0] title = data[1] offset = data[2] duration = data[3] expectedstart = data[4] if pid != self.channels[channel]: # Perhaps just do a duplicate scan before creating Twitter stream if pid == None: self.channels[channel] = None print (channel + ": Off Air") else: self.channels[channel] = pid self.send(["http://www.bbc.co.uk/programmes/" + pid + ".rdf"], "dataout") while not self.dataReady("datain"): pass recvdata = self.recv("datain") if recvdata[0] == "OK": programmedata = recvdata[1] else: # Fake programme data to prevent crash - not ideal programmedata = '<?xml version="1.0" encoding="utf-8"?> \ <rdf:RDF xmlns:rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \ xmlns:rdfs = "http://www.w3.org/2000/01/rdf-schema#" \ xmlns:owl = "http://www.w3.org/2002/07/owl#" \ xmlns:foaf = "http://xmlns.com/foaf/0.1/" \ xmlns:po = "http://purl.org/ontology/po/" \ xmlns:mo = "http://purl.org/ontology/mo/" \ xmlns:skos = "http://www.w3.org/2008/05/skos#" \ xmlns:time = "http://www.w3.org/2006/time#" \ xmlns:dc = "http://purl.org/dc/elements/1.1/" \ xmlns:dcterms = "http://purl.org/dc/terms/" \ xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \ xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \ xmlns:event = "http://purl.org/NET/c4dm/event.owl#"> \ </rdf:RDF>' # RDF reader needs to read from a file so write out first # Alternative is to read from a URL, but this lacks proper proxy support filepath = "tempRDF.txt" file = open(filepath, 'w') file.write(programmedata) file.close() g = Graph() # This is a temporary proxy fix. A URL could be put here instead g.parse("tempRDF.txt") # Identify the brand and whether there are any official hashtags twittags = list() for bid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Brand')): # bid is Brand ID bidmod = bid.replace("#programme","") bidmod = str(bidmod.replace("file:///programmes/","")) if self.officialbrandtags.has_key(bidmod): twittags = self.officialbrandtags[bidmod] break # Identify the series and whether there are any official hashtags if len(twittags) == 0: # Identify the brand and whether there are any official hashtags for sid in g.subjects(object = rdflib.URIRef('http://purl.org/ontology/po/Series')): # sid is Series ID sidmod = sid.replace("#programme","") sidmod = str(sidmod.replace("file:///programmes/","")) if self.officialseriestags.has_key(sidmod): twittags = self.officialseriestags[sidmod] break vidmod = "" so = g.subject_objects(predicate=rdflib.URIRef('http://purl.org/ontology/po/version')) # Pick a version, any version - for this which one doesn't matter for x in so: # vid is version id vid = x[1] vidmod = vid.replace("#programme","") vidmod = vidmod.replace("file:///programmes/","") break # Got version, now get people self.send(["http://www.bbc.co.uk/programmes/" + vidmod + ".rdf"], "dataout") while not self.dataReady("datain"): pass recvdata = self.recv("datain") if recvdata[0] == "OK": versiondata = recvdata[1] else: versiondata = '<?xml version="1.0" encoding="utf-8"?> \ <rdf:RDF xmlns:rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" \ xmlns:rdfs = "http://www.w3.org/2000/01/rdf-schema#" \ xmlns:owl = "http://www.w3.org/2002/07/owl#" \ xmlns:foaf = "http://xmlns.com/foaf/0.1/" \ xmlns:po = "http://purl.org/ontology/po/" \ xmlns:mo = "http://purl.org/ontology/mo/" \ xmlns:skos = "http://www.w3.org/2008/05/skos#" \ xmlns:time = "http://www.w3.org/2006/time#" \ xmlns:dc = "http://purl.org/dc/elements/1.1/" \ xmlns:dcterms = "http://purl.org/dc/terms/" \ xmlns:wgs84_pos= "http://www.w3.org/2003/01/geo/wgs84_pos#" \ xmlns:timeline = "http://purl.org/NET/c4dm/timeline.owl#" \ xmlns:event = "http://purl.org/NET/c4dm/event.owl#"> \ </rdf:RDF>' filepath = "tempRDF.txt" file = open(filepath, 'w') file.write(versiondata) file.close() g = Graph() g.parse("tempRDF.txt") # Identify if this is a change of programme, or the first time we've checked what's on for print clarity if self.firstrun: print (channel + ": " + title) else: print (channel + ": Changed to - " + title) # Minor alterations title = title.replace("&","and") if ":" in title: titlebits = title.split(":") title = titlebits[0] # Saving a copy here so apostrophes etc can be used in the Twitter people search titlesave = title # Remove punctuation for item in """!"#$%()*+,-./;<=>[email protected][\\]?_'`{|}?""": title = title.replace(item,"") keywords = dict() # Save keywords next to a descriptor of what they are keywords[pid] = "PID" # Add official hashtags to the list for tag in twittags: keywords[tag] = "Twitter" # Duplicates will be removed later # If the title has 'The' in it, add hashtags both with and without the 'the' to the keyword list # This simply broadens the list of search terms if string.find(title,"The",0,3) != -1: newtitle = string.replace(re.sub("\s+","",title),"The ","",1) keywords[channel] = "Channel" keywords["#" + string.lower(re.sub("\s+","",title))] = "Title" # Check for and remove year too - some programmes contain a year which may be undesirable from a search point of view keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title" keywords['#' + string.lower(re.sub("\s+","",newtitle))] = "Title" # Check for and remove year too keywords['#' + string.replace(string.lower(re.sub("\s+","",newtitle))," " + str(date.today().year),"",1)] = "Title" else: keywords[channel] = "Channel" keywords["#" + string.lower(re.sub("\s+","",title))] = "Title" keywords["#" + string.replace(string.lower(re.sub("\s+","",title))," " + str(date.today().year),"",1)] = "Title" allwordtitle = string.replace(title,"The ","",1) allwordtitle = allwordtitle.lower() # Remove current year from events allwordtitle = allwordtitle.replace(" " + str(date.today().year),"",1) titlewords = allwordtitle.split() if len(titlewords) > 1: keywords[allwordtitle] = "Title" else: # Trial fix for issue of one word titles producing huge amounts of data # This occurs for keywords like 'Weather' and 'Breakfast' which aren't BBC limited terms keywords[allwordtitle + "^" + "bbc"] = "Title" keywords["#" + re.sub("\s+","",allwordtitle)] = "Title" # Where a channel uses text for a number, we also want to search using the numeric representation numwords = dict({"one" : 1, "two" : 2, "three": 3, "four" : 4, "five": 5, "six" : 6, "seven": 7}) for word in numwords: if word in channel.lower() and channel != "asiannetwork": # Bug fix! asianne2rk numchannel = string.replace(channel.lower(),word,str(numwords[word])) keywords[numchannel] = "Channel" break if str(numwords[word]) in channel.lower(): numchannel = string.replace(channel.lower(),str(numwords[word]),word) keywords[numchannel] = "Channel" break # Load NameCache (people we've already searched for on Twitter to avoid hammering PeopleSearch) save = False try: homedir = os.path.expanduser("~") file = open(homedir + "/namecache.conf",'r') save = True except IOError, e: print ("Failed to load name cache - will attempt to create a new file: " + str(e)) if save: raw_config = file.read() file.close() try: config = cjson.decode(raw_config) except cjson.DecodeError, e: config = dict() else: config = dict() # Find people's names in retrieved RDF s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Role')) for x in s: rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x)) pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant')) firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName'))) lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName'))) if config.has_key(firstname + " " + lastname): # Found a cached value - this person has been searched for using Twitter if config[firstname + " " + lastname] != "": keywords[config[firstname + " " + lastname]] = "Twitter" else: # Not cached yet - new request to Twitter people search self.send(firstname + " " + lastname, "search") while not self.dataReady("search"): pass twitdata = self.recv("search") screenname = "" try: for user in twitdata: # Only use this Twitter screen name if there's a good chance they're the person we're after if user.has_key('verified'): if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname): screenname = user['screen_name'] keywords[screenname] = "Twitter" break except AttributeError, e: pass config[firstname + " " + lastname] = screenname keywords[firstname + " " + lastname] = "Participant" s = g.subjects(predicate=rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),object=rdflib.URIRef('http://purl.org/ontology/po/Character')) for x in s: character = str(g.value(subject=rdflib.BNode(x),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/name'))) rid = g.value(predicate=rdflib.URIRef('http://purl.org/ontology/po/role'),object=rdflib.BNode(x)) pid = g.value(subject=rdflib.BNode(rid),predicate=rdflib.URIRef('http://purl.org/ontology/po/participant')) firstname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/givenName'))) lastname = str(g.value(subject=rdflib.BNode(pid),predicate=rdflib.URIRef('http://xmlns.com/foaf/0.1/familyName'))) # This ^ is a temporary fix until I work out a better DB structure # Character names can sometimes be single common words, like 'James'. # For this reason, using this as a search term we require that either the channel name or programme title also appears in the tweet # The ^ signals to later states of this program that the channel name / title doesn't necessarily have to appear next to the character name keywords[character + "^" + channel] = "Character" keywords[character + "^" + title] = "Character" if " " in character: # Looks like we have a firstname + surname situation charwords = character.split() if charwords[0] != "Dr" and charwords[0] != "Miss" and charwords[0] != "Mr" and charwords[0] != "Mrs" and charwords[0] != "Ms" and charwords[0] != "The": # As long as the first word isn't a title, add it as a first name # This ^ is a temporary fix until I work out a better DB structure keywords[charwords[0] + "^" + channel] = "Character" keywords[charwords[0] + "^" + title] = "Character" elif len(charwords) > 2: # If the first word was a title, and the second word isn't a surname (checked by > 2) add the first name # This ^ is a temporary fix until I work out a better DB structure keywords[charwords[1] + "^" + channel] = "Character" keywords[charwords[1] + "^" + title] = "Character" if config.has_key(firstname + " " + lastname): # Found a cached value if config[firstname + " " + lastname] != "": keywords[config[firstname + " " + lastname]] = "Actor" else: # Not cached yet - new request self.send(firstname + " " + lastname, "search") while not self.dataReady("search"): pass twitdata = self.recv("search") screenname = "" try: for user in twitdata: if user.has_key('verified'): if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == string.lower(firstname + " " + lastname): screenname = user['screen_name'] keywords[screenname] = "Twitter" break except AttributeError, e: pass config[firstname + " " + lastname] = screenname keywords[firstname + " " + lastname] = "Actor" # Radio appears to have been forgotten about a bit in RDF / scheduling at the mo # So, let's do some extra queries and see if the show title is a person's name on Twitter if "radio" in channel or "6music" in channel or "asiannetwork" in channel or "sportsextra" in channel or "worldservice" in channel: # However, radio shows are often named using the DJ - The cases where this isn't true will cause problems however as they'll be saved in json - DOH! TODO if config.has_key(titlesave): # Found a cached value if config[titlesave] != "": keywords[config[titlesave]] = "Twitter" elif len(titlesave.split()) < 4: # Prevent some shows getting through at least - restricts people's names to three words self.send(titlesave, "search") while not self.dataReady("search"): pass twitdata = self.recv("search") screenname = "" try: for user in twitdata: if user.has_key('verified'): if (user['verified'] == True or user['followers_count'] > 10000) and string.lower(user['name']) == titlesave.lower(): screenname = user['screen_name'] keywords[screenname] = "Twitter" break except AttributeError, e: pass config[titlesave] = screenname try: file = open(homedir + "/namecache.conf",'w') raw_config = cjson.encode(config) file.write(raw_config) file.close() except IOError, e: print ("Failed to save name cache - could cause rate limit problems") return [keywords,data] else: if pid == None: print(channel + ": No change - Off Air") else: print (channel + ": No change - " + title) def dbConnect(self): db = MySQLdb.connect(user=self.dbuser,passwd=self.dbpass,db="twitter_bookmarks") cursor = db.cursor() return cursor def main(self): cursor = self.dbConnect() oldkeywords = None while not self.finished(): print ("### Checking current programmes ###") if self.channel != "all": oldpid = self.channels[self.channel] if oldpid == None: cursor.execute("""UPDATE programmes SET imported = 1 WHERE channel = %s""",(self.channel)) data = self.doStuff(self.channel) if data != None: keywords = data[0] pid = data[1][0] title = data[1][1] offset = data[1][2] duration = data[1][3] timestamp = data[1][4] utcoffset = data[1][5] cursor.execute("""UPDATE programmes SET imported = 1 WHERE pid != %s AND channel = %s""",(pid,self.channel)) cursor.execute("""SELECT channel FROM programmes WHERE pid = %s AND timestamp = %s""",(pid,timestamp)) progentrytest = cursor.fetchone() cursor.execute("""SELECT duration FROM programmes_unique WHERE pid = %s""",(pid)) progtest2 = cursor.fetchone() # If the pid and timestamp don't already appear together in the database... if progentrytest == None: cursor.execute("""INSERT INTO programmes (pid,timediff,timestamp,utcoffset,channel) VALUES (%s,%s,%s)""", (pid,offset,timestamp,utcoffset,self.channel)) # If the pid doesn't exist with ANY timestamps in the database... if progtest2 == None: cursor.execute("""INSERT INTO programmes_unique (pid,title,duration) VALUES (%s,%s,%s)""", (pid,title,duration)) for word in keywords: cursor.execute("""INSERT INTO keywords (pid,keyword,type) VALUES (%s,%s,%s)""", (pid,word,keywords[word])) else: # Fix for programmes where the duration is changed last minute if progtest2[0] < duration: #cursor.execute("""UPDATE programmes SET duration = %s WHERE pid = %s AND timestamp = %s""",(duration,pid,timestamp)) cursor.execute("""UPDATE programmes_unique SET duration = %s WHERE pid = %s""",(duration,pid)) keywords = list() else: keywords = None cursor.execute("""SELECT keyword FROM keywords WHERE pid = %s""",(pid)) keywordquery = cursor.fetchall() for keyword in keywordquery: # This ^ is a temporary fix until I work out a better DB structure # As mentioned earlier, the ^ is only relevant to the program, not to Twitter, so it must be removed if "^" in keyword[0]: keywords.append(string.replace(keyword[0],"^"," ")) else: keywords.append(keyword[0]) if (keywords != oldkeywords) & (keywords != None): print keywords self.send([keywords,[pid]],"outbox") pass else: # Still need to fix the 'changed to - off air' problem, but it isn't causing twitter keyword redos thankfully (purely a printing error) # Possible issue will start to occur if programmes change too often - tweet stream will miss too much keywords = list() for channel in self.channels: oldpid = self.channels[channel] if oldpid == None: cursor.execute("""UPDATE programmes SET imported = 1 WHERE channel = %s""",(channel)) data = self.doStuff(channel) if data != None: keywordappender = data[0] pid = data[1][0] title = data[1][1] offset = data[1][2] duration = data[1][3] timestamp = data[1][4] utcoffset = data[1][5] cursor.execute("""UPDATE programmes SET imported = 1 WHERE pid != %s AND channel = %s""",(pid,channel)) cursor.execute("""SELECT channel FROM programmes WHERE pid = %s AND timestamp = %s""",(pid,timestamp)) progentrytest = cursor.fetchone() cursor.execute("""SELECT duration FROM programmes_unique WHERE pid = %s""",(pid)) progtest2 = cursor.fetchone() if progentrytest == None: cursor.execute("""INSERT INTO programmes (pid,timediff,timestamp,utcoffset,channel) VALUES (%s,%s,%s,%s,%s)""", (pid,offset,timestamp,utcoffset,channel)) if progtest2 == None: cursor.execute("""INSERT INTO programmes_unique (pid,title,duration) VALUES (%s,%s,%s)""", (pid,title,duration)) for word in keywordappender: cursor.execute("""INSERT INTO keywords (pid,keyword,type) VALUES (%s,%s,%s)""", (pid,word,keywordappender[word])) else: # Fix for programmes where the duration is changed last minute if progtest2[0] < duration: #cursor.execute("""UPDATE programmes SET duration = %s WHERE pid = %s AND timestamp = %s""",(duration,pid,timestamp)) cursor.execute("""UPDATE programmes_unique SET duration = %s WHERE pid = %s""",(duration,pid)) currentpids = list() for channel in self.channels: if self.channels[channel] != "" and self.channels[channel] != None: currentpids.append(self.channels[channel]) for pid in currentpids: cursor.execute("""SELECT keyword FROM keywords WHERE pid = %s""",(pid)) keywordquery = cursor.fetchall() for keyword in keywordquery: # This ^ is a temporary fix until I work out a better DB structure if "^" in keyword[0]: keywords.append(string.replace(keyword[0],"^"," ")) else: keywords.append(keyword[0]) # Remove repeated keywords here if len(keywords) != 0: keywords = list(set(keywords)) if (keywords != oldkeywords) & (len(keywords) != 0): print keywords self.send([keywords,currentpids],"outbox") #epicfail: now need to send all pids, and search through them further down the line pass oldkeywords = keywords # At this point, find the version tags to allow further info finding # Then, pass keywords to TwitterStream. DataCollector will pick up the data # Must deal with errors passed back from TwitterStream here self.firstrun = False time.sleep(30) # Wait for 30 secs - don't need as much given the wait time between /programmes requests # Could always get this to wait until the programme is due to change, but this *may* miss last minute schedule changes