import os
import re
import subprocess
import datetime

from django.conf import settings
from django.db import transaction

from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag

from pombola.hansard.models import Sitting, Entry, Venue


# EXCEPTIONS
class KenyaParserCouldNotParseTimeString(Exception):
    pass


class KenyaParser():
    """
    # match National Assembly start and end times
    >>> result = KenyaParser.na_reg.match("The House met at 10.15 a.m.")
    >>> result.group('action')
    'met'
    >>> result.group('time')
    '10.15 a.m.'

    >>> result = KenyaParser.na_reg.match("The House rose at 6.45 p.m.")
    >>> result.group('action')
    'rose'

    # match the Senate start and end times
    >>> result = KenyaParser.sen_reg.match("The Senate met at 10.15 a.m.")
    >>> result.group('action')
    'met'

    >>> result = KenyaParser.sen_reg.match("The House met at the Senate Chamber at 10.15 a.m.")
    >>> result.group('time')
    '10.15 a.m.'

    >>> result = KenyaParser.sen_reg.match("The House met at the Senate Chamber, Parliament Buildings, at 10.15 a.m.")
    >>> result.group('time')
    '10.15 a.m.'

    >>> result = KenyaParser.sen_reg.match("The House rose at 6.45 p.m.")
    >>> result.group('action')
    'rose'

    >>> result = KenyaParser.sen_reg.match("The Senate rose at 6.45 p.m.")
    >>> result.group('action')
    'rose'

    # it should cope with the plural form of 'Senate Chamber'
    >>> result = KenyaParser.sen_reg.match("The House met at the Senate Chambers, Parliament Buildings, at 2.30 p.m. [The Speaker (Hon. Ethuro) in the Chair]")
    >>> result.group('action')
    'met'

    # it should cope with the word 'Main' appearing before 'Parliament Buildings'
    >>> result = KenyaParser.sen_reg.match("The Senate met at the Senate Chamber, Main Parliament Buildings, at 2.30 p.m.")
    >>> result.group('action')
    'met'

    # match Joint Sitting start and end times
    >>> result = KenyaParser.joint_reg.match("Parliament rose at 6.45 p.m.")
    >>> result.groups()
    ('rose', '6.45 p.m.')

    >>> result = KenyaParser.joint_reg.match("Parliament met at Fifty eight minutes past Nine o'clock in the National Assembly Chamber")
    >>> result.groups()
    ('met', "Fifty eight minutes past Nine o'clock")
    """

    na_reg = re.compile(r"""
        The\ House\s
        (?P<action>met|rose)\s
        (?:at\ )?
        (?P<time>\d+\.\d+\ [ap].m.)""",
        re.VERBOSE)

    # not ideal as this could accidentally match na_reg as well,
    # relies on na_reg always being checked first
    sen_reg = re.compile(r"""
        The\ (?:House|Senate)\s
        (?P<action>met|rose)
        (?:
            \ at\ the\ Senate\ Chambers?
            (?:,\ (?:Main\ )?Parliament\ Buildings,)?
        )?
        (?:\ at\ )?
        (?P<time>\d+\.\d+\ [ap].m.)""",
        re.VERBOSE)

    joint_reg = re.compile(r"""
        Parliament\s
        (?P<action>(?:met|rose))\s
        at\s
        (?P<time>
            (?:(?:[A-Za-z\- ]+\ minutes?\ (?:past|to)\ )?[A-Za-z]+\ o'clock)
            |
            (?:\d+\.\d+\ [ap].m.)
        )
        (?:\ in\ the\ National\ Assembly\ Chamber)?""",
        re.VERBOSE)
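    # Note: these three regexes are tried in order (na_reg, then sen_reg, then
    # joint_reg) by extract_meta_from_transcript below, both to decide which
    # Venue a transcript belongs to and to pull out the sitting's start and
    # end times.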

    @classmethod
    def convert_pdf_to_html(cls, pdf_file):
        """Given a PDF parse it and return the HTML string representing it"""

        remote_host = settings.KENYA_PARSER_PDF_TO_HTML_HOST

        if remote_host:
            output = cls.convert_pdf_to_html_remote_machine(pdf_file, remote_host)
        else:
            output = cls.convert_pdf_to_html_local_machine( pdf_file )

        # cleanup some known bad chars in the output
        output = re.sub("\xfe\xff", "", output)                    # BOM
        output = re.sub("\u201a\xc4\xf4", "\xe2\x80\x99", output)  # smart quote

        return output

    @classmethod
    def convert_pdf_to_html_local_machine(cls, pdf_file):
        """Use local pdftohtml binary to convert the pdf to html"""

        pdftohtml_cmd = 'pdftohtml'

        # get the version number of pdftohtml and check that it is acceptable - see
        # 'hansard/notes.txt' for issues with the output from different versions.
        # Version output is sent to stderr
        ( ignore_me, version_error ) = subprocess.Popen(
            [ pdftohtml_cmd, '-v' ],
            shell = False,
            stderr = subprocess.PIPE,
        ).communicate()

        wanted_version = 'pdftohtml version 0.18.4'
        if wanted_version not in version_error:
            raise Exception(
                "Bad pdftohtml version - got '%s' but want '%s'"
                % (version_error, wanted_version)
            )

        ( convert_output, ignore_me ) = subprocess.Popen(
            [ pdftohtml_cmd, '-stdout', '-noframes', '-enc', 'UTF-8', pdf_file.name ],
            shell = False,
            stdout = subprocess.PIPE,
        ).communicate()

        return convert_output

    @classmethod
    def convert_pdf_to_html_remote_machine(cls, pdf_file, remote):
        """Convert pdf on a remote machine"""

        bin_dir = os.path.abspath( os.path.dirname( __file__ ) + '/bin' )
        remote_convert_script = os.path.join( bin_dir, 'convert_pdf_to_html_on_remote_machine.bash' )

        remote_pdftohtml = subprocess.Popen(
            [ remote_convert_script, remote, pdf_file.name ],
            shell = False,
            stdout = subprocess.PIPE,
        )

        ( output, ignore ) = remote_pdftohtml.communicate()

        return output
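    # The methods below turn the raw pdftohtml output into structured data and
    # then into database entries. A sketch of how the pieces are intended to
    # fit together (illustrative only, not code that runs here):
    #
    #   html = KenyaParser.convert_pdf_to_html(pdf_file)
    #   data = KenyaParser.convert_html_to_data(html)
    #   KenyaParser.create_entries_from_data_and_source(data, source)
    #
    # where 'source' is expected to be a hansard source object with 'name' and
    # 'date' attributes, as used in create_entries_from_data_and_source below.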

    @classmethod
    def convert_html_to_data(cls, html):

        # Clean out all the &nbsp; entities now. pdftohtml puts them in to preserve the lines
        html = re.sub( r'&nbsp;', ' ', html )
        html = re.sub( r'&#160;', ' ', html )

        # create a soup out of the html
        soup = BeautifulSoup(
            html,
            convertEntities=BeautifulStoneSoup.HTML_ENTITIES,
        )

        if not soup.body:
            raise Exception("No <body> was found - output probably isn't HTML")

        contents = soup.body.contents

        # counters to use in the loops below
        br_count = 0
        page_number = 1

        filtered_contents = []

        while len(contents):
            line = contents.pop(0)

            # get the tag name if there is one
            tag_name = line.name if type(line) == Tag else None

            # count <br> tags - we use two or more in succession to decide that
            # we've moved on to a new bit of text
            if tag_name == 'br':
                br_count += 1
                continue

            # skip empty lines
            if tag_name == None:
                text_content = unicode(line)
            else:
                text_content = line.text

            if re.match( r'\s*$', text_content ):
                continue

            # For Assembly
            # check for something that looks like the page number - when found
            # delete it and the two lines that follow
            if tag_name == 'b':
                page_number_match = re.match( r'(\d+)\s{10,}', line.text )
                if page_number_match:
                    # up the page number - the match is the page that we are leaving
                    page_number = int(page_number_match.group(0)) + 1

                    # skip on to the next page
                    while len(contents):
                        item = contents.pop(0)
                        if type(item) == Tag and item.name == 'hr':
                            break

                    continue

            # For Senate
            # check for something that looks like the page number
            if tag_name == 'b':
                page_number_match = re.search( r'\s{10,}(\d+)', line.text )
                if page_number_match:
                    # set the page number - the match is the page that we are on
                    page_number = int(page_number_match.group(0))
                    continue

            if tag_name == 'b':
                if re.search( r'\s*Disclaimer:', line.text ):
                    # This is a disclaimer line that we can skip
                    continue

            # if br_count > 0:
            #     print 'br_count: ' + str(br_count)
            #     print type( line )
            #     # if type(line) == Tag: print line.name
            # print "%s: >>>%s<<<" % (tag_name, text_content)
            # print '------------------------------------------------------'

            text_content = text_content.strip()
            text_content = re.sub( r'\s+', ' ', text_content )

            filtered_contents.append(dict(
                tag_name     = tag_name,
                text_content = text_content,
                br_count     = br_count,
                page_number  = page_number,
            ))

            br_count = 0

        # go through all the filtered_content and using the br_count determine
        # when lines should be merged
        merged_contents = []

        for line in filtered_contents:
            # print line

            br_count = line['br_count']

            # Join lines that have the same tag_name and are not too far apart
            same_tag_name_test = (
                br_count <= 1
                and len(merged_contents)
                and line['tag_name'] == merged_contents[-1]['tag_name']
            )

            # Italic text in the current unstyled text
            inline_italic_test = (
                br_count == 0
                and len(merged_contents)
                and line['tag_name'] == 'i'
                and merged_contents[-1]['tag_name'] == None
            )

            # Merge lines that meet one of the above tests
            if ( same_tag_name_test or inline_italic_test ):
                new_content = ' '.join(
                    [ merged_contents[-1]['text_content'], line['text_content'] ]
                )
                new_content = re.sub( r'\s+,', ',', new_content )
                merged_contents[-1]['text_content'] = new_content
            else:
                merged_contents.append( line )
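        # At this point each item in merged_contents is a dict of the form
        # built above (the values here are purely illustrative):
        #
        #   { 'tag_name': 'b', 'text_content': 'Mr. Speaker', 'br_count': 2, 'page_number': 3 }
        #
        # The tag_name and br_count are used below to classify each chunk as a
        # scene, heading, speech or other.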
        # now go through and create some meaningful chunks from what we see
        meaningful_content = []
        last_speaker_name = ''
        last_speaker_title = ''

        while len(merged_contents):
            line = merged_contents.pop(0)
            next_line = merged_contents[0] if len(merged_contents) else None

            # print '----------------------------------------'
            # print line

            # if the content is italic then it is a scene
            if line['tag_name'] == 'i':
                meaningful_content.append({
                    'type': 'scene',
                    'text': line['text_content'],
                    'page_number': line['page_number'],
                })
                continue

            # if the content is all caps then it is a heading
            if line['text_content'] == line['text_content'].upper():
                meaningful_content.append({
                    'type': 'heading',
                    'text': line['text_content'],
                    'page_number': line['page_number'],
                })
                last_speaker_name = ''
                last_speaker_title = ''
                continue

            # It is a speech if we have a speaker and it is not formatted
            if line['tag_name'] == None and last_speaker_name:

                # do some quick smarts to see if we can extract a name from the
                # start of the speech.
                speech = line['text_content']

                matches = re.match( r'\(([^\)]+)\):(.*)', speech )
                if matches:
                    last_speaker_title = last_speaker_name
                    last_speaker_name = matches.group(1)
                    speech = matches.group(2)
                else:
                    # strip leading colons that may have been missed when the
                    # name was extracted (usually the colon was outside the
                    # bold tags around the name)
                    speech = re.sub( r'^:\s*', '', speech)

                meaningful_content.append({
                    'speaker_name': last_speaker_name,
                    'speaker_title': last_speaker_title,
                    'text': speech,
                    'type': 'speech',
                    'page_number': line['page_number'],
                })

                # print meaningful_content[-1]

                continue

            # If it is a bold line and the next line is 'None' and is no
            # br_count away then we have the start of a speech.
            if (
                    line['tag_name'] == 'b'
                    and next_line
                    and next_line['tag_name'] == None
                    and next_line['br_count'] == 0
            ):
                last_speaker_name = line['text_content'].strip(':')
                last_speaker_title = ''
                continue

            meaningful_content.append({
                'type': 'other',
                'text': line['text_content'],
                'page_number': line['page_number'],
            })
            last_speaker_name = ''
            last_speaker_title = ''

        hansard_data = {
            'meta': cls.extract_meta_from_transcript( meaningful_content ),
            'transcript': meaningful_content,
        }

        return hansard_data

    @classmethod
    def extract_meta_from_transcript(cls, transcript):

        # create the two venues
        national_assembly, created = Venue.objects.get_or_create(
            slug = 'national_assembly',
            defaults = {"name": "National Assembly"},
        )
        senate, created = Venue.objects.get_or_create(
            slug = 'senate',
            defaults = {"name": "Senate"},
        )

        reg = None
        venue = None

        # work out which one we should use
        for line in transcript:
            text = line.get('text', '')

            if cls.na_reg.search(text):
                reg = KenyaParser.na_reg
                venue = national_assembly
                break
            elif cls.sen_reg.search(text):
                reg = KenyaParser.sen_reg
                venue = senate
                break
            elif cls.joint_reg.search(text):
                reg = KenyaParser.joint_reg
                venue = national_assembly
                break

        if venue is None:
            raise Exception("Failed to find the Venue")

        results = {
            'venue': venue.slug,
        }

        for line in transcript:
            text = line.get('text', '')

            match = reg.match(text)
            if not match:
                continue

            groups = match.groupdict()
            hhmm = cls.parse_time_string( groups['time'] )

            if groups['action'] == 'met':
                results['start_time'] = hhmm
            else:
                results['end_time'] = hhmm

        return results
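    # For reference, some concrete examples of what parse_time_string below
    # produces (worked through from the code rather than taken from any
    # upstream transcript):
    #
    #   '10.15 a.m.'                            -> '10:15:00'
    #   '6.45 p.m.'                             -> '18:45:00'
    #   "Fifty eight minutes past Nine o'clock" -> '09:58:00'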
    @classmethod
    def parse_time_string(cls, time_string):
        """Given a string input return HH:MM:SS format string, or raise
        KenyaParserCouldNotParseTimeString if it can't be done"""

        # A quick google did not reveal a generic time parsing library, although
        # there must be one.
        time_regex = re.compile( r'(\d+)\.(\d+) (a|p)' )
        match_simple = time_regex.match(time_string)

        verbose_time_regex = re.compile(
            r"(?:(?:(?P<minutes>[A-Za-z\- ]+) minutes? (?P<qualifier>past|to) ))?"
            r"(?P<hour>[tA-Za-z]+) o'clock"
        )
        match_verbose = verbose_time_regex.match(time_string)

        if match_simple:
            hour, minute, am_or_pm = match_simple.groups()
            hour = int(hour)
            minute = int(minute)

            if am_or_pm == 'p' and hour < 12:
                hour += 12

        elif match_verbose:
            # attempt to translate words to numbers
            hour = cls.number_word_to_int(match_verbose.group('hour'))
            minute = cls.number_word_to_int(match_verbose.group('minutes'))

            # I don't think this phrasing is used, but just in case
            if match_verbose.group('qualifier') == 'to':
                minute = 60 - minute
                hour = hour - 1

            # make horrible assumption to get am/pm
            if hour < 8:
                hour += 12

        else:
            raise KenyaParserCouldNotParseTimeString("bad time string: '%s'" % time_string)

        return '%02u:%02u:00' % (hour, minute)

    @classmethod
    # adapted from http://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers-python#answer-493788
    def number_word_to_int(cls, textnum):
        if not textnum:
            return 0

        numwords = {}

        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
            "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty"]

        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            numwords[word] = (1, idx * 10)

        textnum = textnum.replace('-', ' ').lower().strip()

        current = 0
        for word in textnum.split():
            if word not in numwords:
                raise Exception("Illegal word: " + word)

            scale, increment = numwords[word]
            current = current * scale + increment

        return current

    @classmethod
    def create_entries_from_data_and_source( cls, data, source ):
        """Create the needed sitting and entries"""

        venue = Venue.objects.get( slug=data['meta']['venue'] )

        # Joint Sittings can be published by both Houses (identical documents)
        # prevent the same Sitting being created twice
        if 'Joint Sitting' in source.name \
           and Sitting.objects.filter(
               venue=venue,
               source__name=source.name,
               start_date=source.date,
               start_time=data['meta'].get('start_time', None),
           ).exists():
            print "skipping duplicate source %s for %s" % (source.name, source.date)
            return None

        sitting = Sitting(
            source     = source,
            venue      = venue,
            start_date = source.date,
            start_time = data['meta'].get('start_time', None),
            end_date   = source.date,
            end_time   = data['meta'].get('end_time', None),
        )
        sitting.save()

        with transaction.atomic():
            counter = 0

            for line in data['transcript']:
                counter += 1

                entry = Entry(
                    sitting       = sitting,
                    type          = line['type'],
                    page_number   = line['page_number'],
                    text_counter  = counter,
                    speaker_name  = line.get('speaker_name', ''),
                    speaker_title = line.get('speaker_title', ''),
                    content       = line['text'],
                )
                entry.save()

            source.last_processing_success = datetime.datetime.now()
            source.save()

        return None
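
# A minimal sketch of the 'data' structure that create_entries_from_data_and_source
# expects, assuming the shape produced by convert_html_to_data above (all of the
# field values here are purely illustrative):
#
#   data = {
#       'meta': {
#           'venue': 'national_assembly',
#           'start_time': '10:15:00',
#           'end_time': '18:45:00',
#       },
#       'transcript': [
#           {'type': 'heading', 'text': 'ORAL ANSWERS TO QUESTIONS', 'page_number': 1},
#           {'type': 'speech', 'speaker_name': 'Mr. Speaker', 'speaker_title': '',
#            'text': 'Order, order!', 'page_number': 1},
#       ],
#   }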