scrapi.util.copy_to_unicode

Here are examples of the Python API scrapi.util.copy_to_unicode taken from open source projects.

10 Examples
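Every example below passes a plain string (a document ID or a JSON-serialized field) through copy_to_unicode before storing it or wrapping it in a RawDocument, and the test in Example 10 pins down the contract: the result is a six.text_type value. As a rough, hypothetical sketch of a function satisfying that contract (an assumption for illustration, not the actual scrapi.util source):

    import six


    def copy_to_unicode(element, encoding='utf-8'):
        # Hypothetical sketch only -- the real scrapi.util implementation may differ.
        # Return the input as a six.text_type (unicode) string.
        if isinstance(element, six.text_type):
            return element
        # Assume byte strings are decoded with the given encoding.
        return six.text_type(element, encoding)

Note that in Example 2 the raw document body itself is stored as UTF-8 encoded bytes rather than passed through copy_to_unicode; only the metadata fields are converted.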

Example 1

Project: scrapi
Source File: cassandra.py
    @events.logged(events.PROCESSING, 'normalized.cassandra')
    def process_normalized(self, raw_doc, normalized):
        self.send_to_database(
            source=copy_to_unicode(raw_doc['source']),
            docID=copy_to_unicode(raw_doc['docID']),
            contributors=copy_to_unicode(json.dumps(normalized['contributors'])),
            description=copy_to_unicode(normalized.get('description')),
            uris=copy_to_unicode(json.dumps(normalized['uris'])),
            providerUpdatedDateTime=parse(normalized['providerUpdatedDateTime']).replace(tzinfo=None),
            freeToRead=copy_to_unicode(json.dumps(normalized.get('freeToRead', {}))),
            languages=normalized.get('language'),
            licenses=copy_to_unicode(json.dumps(normalized.get('licenseRef', []))),
            publisher=copy_to_unicode(json.dumps(normalized.get('publisher', {}))),
            sponsorships=copy_to_unicode(json.dumps(normalized.get('sponsorship', []))),
            title=copy_to_unicode(normalized['title']),
            version=copy_to_unicode(json.dumps(normalized.get('version', {}))),
            otherProperties=copy_to_unicode(json.dumps(normalized.get('otherProperties', {}))),
            shareProperties=copy_to_unicode(json.dumps(normalized['shareProperties']))
        ).save()

Example 2

Project: scrapi
Source File: cassandra.py
    @events.logged(events.PROCESSING, 'raw.cassandra')
    def process_raw(self, raw_doc):
        self.send_to_database(
            source=copy_to_unicode(raw_doc['source']),
            docID=copy_to_unicode(raw_doc['docID']),
            doc=six.text_type(raw_doc['doc']).encode('utf-8'),
            filetype=copy_to_unicode(raw_doc['filetype']),
            timestamps=raw_doc.get('timestamps', {})
        ).save()

Example 3

Project: scrapi
Source File: clinicaltrials.py
    def harvest(self, start_date=None, end_date=None):
        """ First, get a list of all recently updated study urls,
        then get the xml one by one and save it into a list
        of docs including other information """

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        end_month = end_date.strftime('%m')
        end_day = end_date.strftime('%d')
        end_year = end_date.strftime('%Y')

        start_month = start_date.strftime('%m')
        start_day = start_date.strftime('%d')
        start_year = start_date.strftime('%Y')

        base_url = 'http://clinicaltrials.gov/ct2/results?lup_s='
        url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.\
            format(start_month, start_day, start_year, end_month, end_day, end_year)

        url = base_url + url_end

        # grab the total number of studies
        initial_request = requests.get(url)
        record_encoding = initial_request.encoding
        initial_request_xml = etree.XML(initial_request.content)
        count = int(initial_request_xml.xpath('//search_results/@count')[0])
        xml_list = []
        if int(count) > 0:
            # get a new url with all results in it
            url = url + '&count=' + str(count)
            total_requests = requests.get(url)
            initial_doc = etree.XML(total_requests.content)

            # make a list of urls from that full list of studies
            study_urls = []
            for study in initial_doc.xpath('//clinical_study'):
                study_urls.append(study.xpath('url/node()')[0] + '?displayxml=true')

            # grab each of those urls for full content
            logger.info("There are {} urls to harvest - be patient...".format(len(study_urls)))
            count = 0
            official_count = 0
            for study_url in study_urls:
                try:
                    content = requests.get(study_url)
                except requests.exceptions.ConnectionError as e:
                    logger.info('Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    continue
                doc = etree.XML(content.content)
                record = etree.tostring(doc, encoding=record_encoding)
                doc_id = doc.xpath('//nct_id/node()')[0]
                xml_list.append(RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml',
                }))
                official_count += 1
                count += 1
                if count % 100 == 0:
                    logger.info("You've requested {} studies, keep going!".format(official_count))
                    count = 0

        return xml_list

Example 4

Project: scrapi
Source File: dailyssrn.py
    def harvest(self, start_date=None, end_date=None):

        url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'

        data = requests.get(url, force=True)
        doc = etree.XML(data.content)

        records = doc.xpath('channel/item')

        xml_list = []
        for record in records:
            doc_id = parse_id_from_url(record.xpath('link/node()'))
            record = etree.tostring(record)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list

Example 5

Project: scrapi
Source File: dataone.py
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            # This ID is unique per data package, but won't unify multiple packages for the same project
            doc_id = record.xpath("str[@name='id']")[0].text
            format_type = record.xpath("str[@name='formatType']")[0].text
            record = ElementTree.tostring(record, encoding=self.record_encoding)
            if format_type.lower() != 'metadata':
                logger.info('Not normalizing record with ID {}, type {}'.format(doc_id, format_type))
            else:
                xml_list.append(RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list

Example 6

Project: scrapi
Source File: doepages.py
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'
        url = base_url.format('1', format_date_with_slashes(start_date), format_date_with_slashes(end_date))
        initial_data = requests.get(url)
        record_encoding = initial_data.encoding
        initial_doc = etree.XML(initial_data.content)

        num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

        url = base_url.format(num_results, start_date, end_date)

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('records/record')

        xml_list = []
        for record in records:
            doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
            record = etree.tostring(record, encoding=record_encoding)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list

Example 7

Project: scrapi
Source File: ncar.py
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'

        url = base_url.format(start_date, end_date)

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

        xml_list = []
        for record in records:
            doc_id = record.xpath('//OAI-PMH:header/OAI-PMH:identifier/node()', namespaces=self.namespaces)[0]
            record = etree.tostring(record)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list

Example 8

Project: scrapi
Source File: nih.py
    def harvest(self, start_date=None, end_date=None):
        """
        Return a list of RawDocuments
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://exporter.nih.gov/'
        table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'

        # get ExPORTER page html and rows storing records
        html = requests.get(table_url).content
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table', id="ContentPlaceHolder1_ProjectData_dgProjectData")
        rows = table.find_all('tr', class_="row_bg")
        urls = [i for i in construct_urls(base_url, start_date, end_date, rows)]

        return [
            RawDocument({
                'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING),
                'source': self.short_name,
                'docID': copy_to_unicode(record.xpath('.//APPLICATION_ID/node()', namespaces=self.namespaces)[0]),
                'filetype': 'xml'
            }) for record in xml_records(get_xml_files(urls))
        ]

Example 9

Project: scrapi
Source File: noaa_nodc.py
    def harvest(self, start_date=None, end_date=None):
        ''' First, get a list of all recently updated study urls,
        then get the xml one by one and save it into a list
        of docs including other information '''

        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        # grab each of those urls for full content
        xml_list = []
        xml_base_url = self.canonical_base_url + '&view=xml'
        for dataset_id in self.query_by_date(start_date, end_date):
            try:
                item_url = str(xml_base_url).format(dataset_id)
                content = requests.get(item_url, throttle=2)
            except exceptions.ConnectionError as e:
                logger.info('Connection error: {}, wait a bit...'.format(e))
                time.sleep(30)
                content = requests.get(item_url)
            doc = etree.XML(content.content)

            record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(dataset_id),
                'filetype': 'xml',
            }))

        return xml_list

Example 10

Project: scrapi
Source File: test_utils.py
    def test_copy_to_unicode(self):
        converted = util.copy_to_unicode('test')

        assert converted == u'test'
        assert isinstance(converted, six.text_type)
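
A minimal usage sketch mirroring the harvesters above (the identifier and contributor values here are made up for illustration, not taken from any real record):

    import json

    import six

    from scrapi.util import copy_to_unicode

    # Hypothetical values standing in for data pulled from a harvested record.
    doc_id = '12345'
    contributors = [{'name': 'Example Contributor'}]

    unicode_id = copy_to_unicode(doc_id)
    unicode_contributors = copy_to_unicode(json.dumps(contributors))

    assert isinstance(unicode_id, six.text_type)
    assert isinstance(unicode_contributors, six.text_type)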