scrapi.linter.document.NormalizedDocument

Here are examples of the Python API scrapi.linter.document.NormalizedDocument, taken from open source projects.

4 Examples
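Before the project examples, a minimal sketch of the constructor pattern they all share: a plain dict of normalized attributes is passed positionally, optionally with clean=True as in the harvester examples below. The attribute keys here are illustrative only, not scrapi's full normalized schema, so an incomplete dict like this one may fail validation.

    # Minimal sketch (not taken from the projects below): wrap a dict of
    # normalized attributes in a NormalizedDocument. The keys shown are
    # illustrative; scrapi validates the attributes against its normalized
    # schema, so missing required fields may raise a validation error.
    from scrapi.linter.document import NormalizedDocument

    attributes = {
        'title': 'An example record',
        'contributors': [{'name': 'Jane Doe'}],
        'shareProperties': {
            'source': 'example_source',
            'docID': 'example-1',
        },
    }

    # clean=True mirrors the harvester examples below; it is optional.
    document = NormalizedDocument(attributes, clean=True)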

Example 1

Project: scrapi
Source File: __init__.py
    def normalize(self, raw_doc):
        transformed = self.transform(json.loads(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
        transformed['shareProperties'] = {
            'source': self.short_name,
            'docID': raw_doc['docID'],
            'filetype': raw_doc['filetype']
        }
        return NormalizedDocument(transformed, clean=True)

Example 2

Project: scrapi
Source File: __init__.py
    def normalize(self, raw_doc):
        transformed = self.transform(etree.XML(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
        transformed['shareProperties'] = {
            'source': self.short_name,
            'docID': raw_doc['docID'],
            'filetype': raw_doc['filetype']
        }
        return NormalizedDocument(transformed, clean=True)

Example 3

Project: scrapi
Source File: push_api.py
    def normalize(self, raw):
        raw_data = json.loads(raw['doc'])
        document = raw_data['jsonData']

        # This is a workaround because the push API did not have proper email validation
        for contributor in document['contributors']:
            if contributor.get('email', None) == '':
                del contributor['email']

        # If the status is marked deleted in the push API, mark it in shareProperties
        if raw_data['status'] == 'deleted':
            document['shareProperties']['status'] = 'deleted'

        return NormalizedDocument(document)

Example 4

Project: scrapi
Source File: test_migrations.py
@pytest.mark.django_db
@pytest.mark.cassandra
@pytest.mark.parametrize('canonical', ['postgres', 'cassandra'])
@pytest.mark.parametrize('destination', ['postgres', 'cassandra'])
def test_cross_db_with_versions(canonical, destination, monkeypatch, index='test'):
    new_title = 'How to be really good at Zoo Tycoon: The Definitive Guide'

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    # Get a version in there too
    new_normalized = copy.deepcopy(NORMALIZED.attributes)
    new_normalized['title'] = new_title
    canonical_processor.process_normalized(RAW, NormalizedDocument(new_normalized))

    destination_processor = get_processor(destination)

    # Check that the canonical_processor versions are there, and the destination's are not
    canonical_versions = list(canonical_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(canonical_versions) == 3
    assert canonical_versions[1].normalized['title'] == NORMALIZED['title']
    assert canonical_versions[2].normalized['title'] == new_title

    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index, versions=True)

    # Check that the document made it to the destination, and is still in the canonical
    destination_versions = list(destination_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(destination_versions) == 3
    assert destination_versions[1].normalized['title'] == NORMALIZED['title']
    assert destination_versions[2].normalized['title'] == new_title

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc