org.apache.iceberg.mapping.NameMapping

Here are examples of the Java API org.apache.iceberg.mapping.NameMapping, taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
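
For orientation, here is a minimal sketch (not one of the voted examples below) of the usual NameMapping lifecycle: build a mapping from a Schema or assemble one by hand, serialize it with NameMappingParser, and resolve column names back to field ids. It only combines calls that appear in the examples on this page (MappingUtil.create, NameMapping.of, MappedFields.of, MappedField.of, NameMappingParser.toJson/fromJson, NameMapping.find); the class name, column names, and the "payload" alias are illustrative.

import java.util.Arrays;

import org.apache.iceberg.Schema;
import org.apache.iceberg.mapping.MappedField;
import org.apache.iceberg.mapping.MappedFields;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.types.Types;

public class NameMappingExample {
    public static void main(String[] args) {
        // derive a mapping from an Iceberg schema: one mapped field per column, keyed by field id
        Schema schema = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));
        NameMapping fromSchema = MappingUtil.create(schema);

        // or assemble a mapping by hand, e.g. to register "payload" as an extra alias for field id 2
        NameMapping manual = NameMapping.of(MappedFields.of(
            MappedField.of(1, Arrays.asList("id")),
            MappedField.of(2, Arrays.asList("data", "payload"))));

        // round-trip through JSON, the form stored under TableProperties.DEFAULT_NAME_MAPPING
        String json = NameMappingParser.toJson(manual);
        NameMapping parsed = NameMappingParser.fromJson(json);

        // resolve a (possibly aliased) column name back to its field id
        MappedField field = parsed.find(Arrays.asList("payload"));
        System.out.println(fromSchema.asMappedFields() + " / id of payload: " + field.id());
    }
}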

56 Examples

19 Source : SparkTableUtil.java
with Apache License 2.0
from apache

/**
 * Returns the data files in a partition by listing the partition location.
 * <p>
 * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions,
 * metrics are set to null.
 * <p>
 * Note: certain metrics, like NaN counts, that are only supported by iceberg file writers but not file footers, will
 * not be populated.
 *
 * @param partition partition key, e.g., "a=1/b=2"
 * @param uri partition location URI
 * @param format partition format, avro or parquet
 * @param spec a partition spec
 * @param conf a Hadoop conf
 * @param metricsConfig a metrics conf
 * @param mapping a name mapping
 * @return a List of DataFile
 */
public static List<DataFile> listPartition(Map<String, String> partition, String uri, String format, PartitionSpec spec, Configuration conf, MetricsConfig metricsConfig, NameMapping mapping) {
    if (format.contains("avro")) {
        return listAvroPartition(partition, uri, spec, conf);
    } else if (format.contains("parquet")) {
        return listParquetPartition(partition, uri, spec, conf, metricsConfig, mapping);
    } else if (format.contains("orc")) {
        return listOrcPartition(partition, uri, spec, conf, metricsConfig, mapping);
    } else {
        throw new UnsupportedOperationException("Unknown partition format: " + format);
    }
}

19 Source : SparkTableUtil.java
with Apache License 2.0
from apache

/**
 * Returns the data files in a partition by listing the partition location.
 *
 * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions,
 * metrics are set to null.
 *
 * @param partition a partition
 * @param conf a serializable Hadoop conf
 * @param metricsConfig a metrics conf
 * @param mapping a name mapping
 * @return a List of DataFile
 */
public static List<DataFile> listPartition(SparkPartition partition, PartitionSpec spec, SerializableConfiguration conf, MetricsConfig metricsConfig, NameMapping mapping) {
    return listPartition(partition.values, partition.uri, partition.format, spec, conf.get(), metricsConfig, mapping);
}

19 Source : ParquetUtil.java
with Apache License 2.0
from apache

private static MessageType getParquetTypeWithIds(ParquetMetadata metadata, NameMapping nameMapping) {
    MessageType type = metadata.getFileMetaData().getSchema();
    if (ParquetSchemaUtil.hasIds(type)) {
        return type;
    }
    if (nameMapping != null) {
        return ParquetSchemaUtil.applyNameMapping(type, nameMapping);
    }
    return ParquetSchemaUtil.addFallbackIds(type);
}

19 Source : ParquetUtil.java
with Apache License 2.0
from apache

public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig, NameMapping nameMapping) {
    try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) {
        return footerMetrics(reader.getFooter(), Stream.empty(), metricsConfig, nameMapping);
    } catch (IOException e) {
        throw new RuntimeIOException(e, "Failed to read footer of file: %s", file);
    }
}

19 Source : ParquetReadSupport.java
with Apache License 2.0
from apache

/**
 * Parquet {@link ReadSupport} that handles column projection based on {@link Schema} column IDs.
 *
 * @param <T> Java type produced by this read support instance
 */
class ParquetReadSupport<T> extends ReadSupport<T> {

    private final Schema expectedSchema;

    private final ReadSupport<T> wrapped;

    private final boolean callInit;

    private final NameMapping nameMapping;

    ParquetReadSupport(Schema expectedSchema, ReadSupport<T> readSupport, boolean callInit, NameMapping nameMapping) {
        this.expectedSchema = expectedSchema;
        this.wrapped = readSupport;
        this.callInit = callInit;
        this.nameMapping = nameMapping;
    }

    @Override
    @SuppressWarnings("deprecation")
    public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema) {
        // Columns are selected from the Parquet file by taking the read context's message type and
        // matching to the file's columns by full path, so this must select columns by using the path
        // in the file's schema.
        MessageType projection;
        if (ParquetSchemaUtil.hasIds(fileSchema)) {
            projection = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
        } else if (nameMapping != null) {
            MessageType typeWithIds = ParquetSchemaUtil.applyNameMapping(fileSchema, nameMapping);
            projection = ParquetSchemaUtil.pruneColumns(typeWithIds, expectedSchema);
        } else {
            projection = ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
        }
        // override some known backward-compatibility options
        configuration.set("parquet.strict.typing", "false");
        configuration.set("parquet.avro.add-list-element-records", "false");
        configuration.set("parquet.avro.write-old-list-structure", "false");
        // set Avro schemas in case the reader is Avro
        AvroReadSupport.setRequestedProjection(configuration, AvroSchemaUtil.convert(expectedSchema, projection.getName()));
        org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()), expectedSchema, ImmutableMap.of());
        AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));
        // let the context set up read support metadata, but always use the correct projection
        ReadContext context = null;
        if (callInit) {
            try {
                context = wrapped.init(configuration, keyValueMetaData, projection);
            } catch (UnsupportedOperationException e) {
                // try the InitContext version
                context = wrapped.init(new InitContext(configuration, makeMultimap(keyValueMetaData), projection));
            }
        }
        return new ReadContext(projection, context != null ? context.getReadSupportMetadata() : ImmutableMap.of());
    }

    @Override
    public RecordMaterializer<T> prepareForRead(Configuration configuration, Map<String, String> fileMetadata, MessageType fileMessageType, ReadContext readContext) {
        // This is the type created in init that was based on the file's schema. The schema that this
        // will pass to the wrapped ReadSupport needs to match the expected schema's names. Rather than
        // renaming the file's schema, convert the expected schema to Parquet. This relies on writing
        // files with the correct schema.
        // TODO: this breaks when columns are reordered.
        MessageType readSchema = ParquetSchemaUtil.convert(expectedSchema, fileMessageType.getName());
        return wrapped.prepareForRead(configuration, fileMetadata, readSchema, readContext);
    }

    private Map<String, Set<String>> makeMultimap(Map<String, String> map) {
        ImmutableMap.Builder<String, Set<String>> builder = ImmutableMap.builder();
        for (Map.Entry<String, String> entry : map.entrySet()) {
            builder.put(entry.getKey(), Sets.newHashSet(entry.getValue()));
        }
        return builder.build();
    }
}

19 Source : ApplyNameMapping.java
with Apache License 2.0
from apache

class ApplyNameMapping extends ParquetTypeVisitor<Type> {

    private final NameMapping nameMapping;

    ApplyNameMapping(NameMapping nameMapping) {
        this.nameMapping = nameMapping;
    }

    @Override
    public Type message(MessageType message, List<Type> fields) {
        Types.MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage();
        fields.stream().filter(Objects::nonNull).forEach(builder::addField);
        return builder.named(message.getName());
    }

    @Override
    public Type struct(GroupType struct, List<Type> types) {
        MappedField field = nameMapping.find(currentPath());
        List<Type> actualTypes = types.stream().filter(Objects::nonNull).collect(Collectors.toList());
        Type structType = struct.withNewFields(actualTypes);
        return field == null ? structType : structType.withId(field.id());
    }

    @Override
    public Type list(GroupType list, Type elementType) {
        Preconditions.checkArgument(elementType != null, "List type must have element field");
        MappedField field = nameMapping.find(currentPath());
        Type listType = org.apache.parquet.schema.Types.list(list.getRepetition()).element(elementType).named(list.getName());
        return field == null ? listType : listType.withId(field.id());
    }

    @Override
    public Type map(GroupType map, Type keyType, Type valueType) {
        Preconditions.checkArgument(keyType != null && valueType != null, "Map type must have both key field and value field");
        MappedField field = nameMapping.find(currentPath());
        Type mapType = org.apache.parquet.schema.Types.map(map.getRepetition()).key(keyType).value(valueType).named(map.getName());
        return field == null ? mapType : mapType.withId(field.id());
    }

    @Override
    public Type primitive(PrimitiveType primitive) {
        MappedField field = nameMapping.find(currentPath());
        return field == null ? primitive : primitive.withId(field.id());
    }

    @Override
    public void beforeRepeatedElement(Type element) {
    // do not add the repeated element's name
    }

    @Override
    public void afterRepeatedElement(Type element) {
    // do not remove the repeated element's name
    }

    @Override
    public void beforeRepeatedKeyValue(Type keyValue) {
    // do not add the repeated element's name
    }

    @Override
    public void afterRepeatedKeyValue(Type keyValue) {
    // do not remove the repeated element's name
    }
}

19 Source : ORCSchemaUtil.java
with Apache License 2.0
from apache

static TypeDescription applyNameMapping(TypeDescription orcSchema, NameMapping nameMapping) {
    return OrcSchemaVisitor.visit(orcSchema, new ApplyNameMapping(nameMapping));
}

19 Source : OrcMetrics.java
with Apache License 2.0
from apache

public static Metrics fromInputFile(InputFile file, MetricsConfig metricsConfig, NameMapping mapping) {
    final Configuration config = (file instanceof HadoopInputFile) ? ((HadoopInputFile) file).getConf() : new Configuration();
    return fromInputFile(file, config, metricsConfig, mapping);
}

19 Source : ApplyNameMapping.java
with Apache License 2.0
from apache

class ApplyNameMapping extends OrcSchemaVisitor<TypeDescription> {

    private final NameMapping nameMapping;

    ApplyNameMapping(NameMapping nameMapping) {
        this.nameMapping = nameMapping;
    }

    @Override
    public String elementName() {
        return "element";
    }

    @Override
    public String keyName() {
        return "key";
    }

    @Override
    public String valueName() {
        return "value";
    }

    TypeDescription setId(TypeDescription type, MappedField mappedField) {
        if (mappedField != null) {
            type.setAttribute(ORCSchemaUtil.ICEBERG_ID_ATTRIBUTE, mappedField.id().toString());
        }
        return type;
    }

    @Override
    public TypeDescription record(TypeDescription record, List<String> names, List<TypeDescription> fields) {
        Preconditions.checkArgument(names.size() == fields.size(), "All fields must have names");
        MappedField field = nameMapping.find(currentPath());
        TypeDescription structType = TypeDescription.createStruct();
        for (int i = 0; i < fields.size(); i++) {
            String fieldName = names.get(i);
            TypeDescription fieldType = fields.get(i);
            if (fieldType != null) {
                structType.addField(fieldName, fieldType);
            }
        }
        return setId(structType, field);
    }

    @Override
    public TypeDescription list(TypeDescription array, TypeDescription element) {
        Preconditions.checkArgument(element != null, "List type must have element type");
        MappedField field = nameMapping.find(currentPath());
        TypeDescription listType = TypeDescription.createList(element);
        return setId(listType, field);
    }

    @Override
    public TypeDescription map(TypeDescription map, TypeDescription key, TypeDescription value) {
        Preconditions.checkArgument(key != null && value != null, "Map type must have both key and value types");
        MappedField field = nameMapping.find(currentPath());
        TypeDescription mapType = TypeDescription.createMap(key, value);
        return setId(mapType, field);
    }

    @Override
    public TypeDescription primitive(TypeDescription primitive) {
        MappedField field = nameMapping.find(currentPath());
        return setId(primitive.clone(), field);
    }
}

19 Source : TestAvroNameMapping.java
with Apache License 2.0
from apache

@Test
public void testAliases() throws IOException {
    Schema writeSchema = new Schema(Types.NestedField.optional(22, "points", Types.ListType.ofOptional(21, Types.StructType.of(Types.NestedField.required(19, "x", Types.IntegerType.get())))));
    Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table"));
    Record pointRecord = new Record(AvroSchemaUtil.fromOption(AvroSchemaUtil.fromOption(record.getSchema().getField("points").schema()).getElementType()));
    pointRecord.put("x", 1);
    record.put("points", ImmutableList.of(pointRecord));
    NameMapping nameMapping = NameMapping.of(MappedFields.of(MappedField.of(22, "points", MappedFields.of(MappedField.of(21, "element", MappedFields.of(MappedField.of(19, Lists.newArrayList("x"))))))));
    Schema readSchema = new Schema(Types.NestedField.optional(22, "points", Types.ListType.ofOptional(21, Types.StructType.of(// x renamed to y
    Types.NestedField.required(19, "y", Types.IntegerType.get())))));
    Record projected = writeAndRead(writeSchema, readSchema, record, nameMapping);
    replacedert.replacedertEquals("x is read as y", 1, ((List<Record>) projected.get("points")).get(0).get("y"));
    readSchema = new Schema(Types.NestedField.optional(22, "points", Types.ListType.ofOptional(21, Types.StructType.of(// x renamed to z
    Types.NestedField.required(19, "z", Types.IntegerType.get())))));
    projected = writeAndRead(writeSchema, readSchema, record, nameMapping);
    replacedert.replacedertEquals("x is read as z", 1, ((List<Record>) projected.get("points")).get(0).get("z"));
}

19 Source : AvroSchemaUtil.java
with Apache License 2.0
from apache

public static Schema pruneColumns(Schema schema, Set<Integer> selectedIds, NameMapping nameMapping) {
    return new PruneColumns(selectedIds, nameMapping).rootSchema(schema);
}

18 Source : Spark3CreateAction.java
with Apache License 2.0
from apache

protected void ensureNameMappingPresent(Table table) {
    if (!table.properties().containsKey(TableProperties.DEFAULT_NAME_MAPPING)) {
        NameMapping nameMapping = MappingUtil.create(table.schema());
        String nameMappingJson = NameMappingParser.toJson(nameMapping);
        table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, nameMappingJson).commit();
    }
}
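
A reader of such a table later resolves this property back into a NameMapping before planning a scan; a minimal sketch of that lookup follows (the helper name readNameMapping is made up for illustration; the calls mirror the importUnpartitionedSparkTable example further down this page):

static NameMapping readNameMapping(Table table) {
    // tables that never had the property set simply have no default mapping
    String nameMappingJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    return nameMappingJson != null ? NameMappingParser.fromJson(nameMappingJson) : null;
}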

18 Source : TestNameMappingProjection.java
with Apache License 2.0
from apache

private void assertNameMappingProjection(DataFile dataFile, String tableName) {
    Schema filteredSchema = new Schema(required(1, "name", Types.StringType.get()));
    NameMapping nameMapping = MappingUtil.create(filteredSchema);
    Schema tableSchema = new Schema(required(1, "name", Types.StringType.get()), optional(2, "id", Types.IntegerType.get()));
    Table table = catalog.createTable(org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, tableName), tableSchema, PartitionSpec.unpartitioned());
    table.updateProperties().set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)).commit();
    table.newFastAppend().appendFile(dataFile).commit();
    List<Row> actual = spark.read().format("iceberg").load(String.format("%s.%s", DB_NAME, tableName)).filter("name='Alice'").collectAsList();
    replacedert.replacedertEquals("Should project 1 record", 1, actual.size());
    replacedert.replacedertEquals("Should equal to 'Alice'", "Alice", actual.get(0).getString(0));
    replacedert.replacedertNull("should be null", actual.get(0).get(1));
}

18 Source : ParquetSchemaUtil.java
with Apache License 2.0
from apache

public static MessageType applyNameMapping(MessageType fileSchema, NameMapping nameMapping) {
    return (MessageType) ParquetTypeVisitor.visit(fileSchema, new ApplyNameMapping(nameMapping));
}

18 Source : OrcMetrics.java
with Apache License 2.0
from apache

static Metrics fromInputFile(InputFile file, Configuration config, MetricsConfig metricsConfig, NameMapping mapping) {
    try (Reader orcReader = ORC.newFileReader(file, config)) {
        return buildOrcMetrics(orcReader.getNumberOfRows(), orcReader.getSchema(), orcReader.getStatistics(), Stream.empty(), metricsConfig, mapping);
    } catch (IOException ioe) {
        throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
    }
}

18 Source : OrcIterable.java
with Apache License 2.0
from apache

/**
 * Iterable used to read rows from ORC.
 */
class OrcIterable<T> extends CloseableGroup implements CloseableIterable<T> {

    private final Configuration config;

    private final Schema schema;

    private final InputFile file;

    private final Long start;

    private final Long length;

    private final Function<TypeDescription, OrcRowReader<?>> readerFunction;

    private final Expression filter;

    private final boolean caseSensitive;

    private final Function<TypeDescription, OrcBatchReader<?>> batchReaderFunction;

    private final int recordsPerBatch;

    private NameMapping nameMapping;

    OrcIterable(InputFile file, Configuration config, Schema schema, NameMapping nameMapping, Long start, Long length, Function<TypeDescription, OrcRowReader<?>> readerFunction, boolean caseSensitive, Expression filter, Function<TypeDescription, OrcBatchReader<?>> batchReaderFunction, int recordsPerBatch) {
        this.schema = schema;
        this.readerFunction = readerFunction;
        this.file = file;
        this.nameMapping = nameMapping;
        this.start = start;
        this.length = length;
        this.config = config;
        this.caseSensitive = caseSensitive;
        this.filter = (filter == Expressions.alwaysTrue()) ? null : filter;
        this.batchReaderFunction = batchReaderFunction;
        this.recordsPerBatch = recordsPerBatch;
    }

    @SuppressWarnings("unchecked")
    @Override
    public CloseableIterator<T> iterator() {
        Reader orcFileReader = ORC.newFileReader(file, config);
        addCloseable(orcFileReader);
        TypeDescription fileSchema = orcFileReader.getSchema();
        final TypeDescription readOrcSchema;
        if (ORCSchemaUtil.hasIds(fileSchema)) {
            readOrcSchema = ORCSchemaUtil.buildOrcProjection(schema, fileSchema);
        } else {
            if (nameMapping == null) {
                nameMapping = MappingUtil.create(schema);
            }
            TypeDescription typeWithIds = ORCSchemaUtil.applyNameMapping(fileSchema, nameMapping);
            readOrcSchema = ORCSchemaUtil.buildOrcProjection(schema, typeWithIds);
        }
        SearchArgument sarg = null;
        if (filter != null) {
            Expression boundFilter = Binder.bind(schema.asStruct(), filter, caseSensitive);
            sarg = ExpressionToSearchArgument.convert(boundFilter, readOrcSchema);
        }
        VectorizedRowBatchIterator rowBatchIterator = newOrcIterator(file, readOrcSchema, start, length, orcFileReader, sarg, recordsPerBatch);
        if (batchReaderFunction != null) {
            OrcBatchReader<T> batchReader = (OrcBatchReader<T>) batchReaderFunction.apply(readOrcSchema);
            return CloseableIterator.transform(rowBatchIterator, pair -> {
                batchReader.setBatchContext(pair.second());
                return batchReader.read(pair.first());
            });
        } else {
            return new OrcRowIterator<>(rowBatchIterator, (OrcRowReader<T>) readerFunction.apply(readOrcSchema));
        }
    }

    private static VectorizedRowBatchIterator newOrcIterator(InputFile file, TypeDescription readerSchema, Long start, Long length, Reader orcFileReader, SearchArgument sarg, int recordsPerBatch) {
        final Reader.Options options = orcFileReader.options();
        if (start != null) {
            options.range(start, length);
        }
        options.schema(readerSchema);
        options.searchArgument(sarg, new String[] {});
        try {
            return new VectorizedRowBatchIterator(file.location(), readerSchema, orcFileReader.rows(options), recordsPerBatch);
        } catch (IOException ioe) {
            throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file);
        }
    }

    private static class OrcRowIterator<T> implements CloseableIterator<T> {

        private int nextRow;

        private VectorizedRowBatch current;

        private int currentBatchSize;

        private final VectorizedRowBatchIterator batchIter;

        private final OrcRowReader<T> reader;

        OrcRowIterator(VectorizedRowBatchIterator batchIter, OrcRowReader<T> reader) {
            this.batchIter = batchIter;
            this.reader = reader;
            current = null;
            nextRow = 0;
            currentBatchSize = 0;
        }

        @Override
        public boolean hasNext() {
            return (current != null && nextRow < currentBatchSize) || batchIter.hasNext();
        }

        @Override
        public T next() {
            if (current == null || nextRow >= currentBatchSize) {
                Pair<VectorizedRowBatch, Long> nextBatch = batchIter.next();
                current = nextBatch.first();
                currentBatchSize = current.size;
                nextRow = 0;
                this.reader.setBatchContext(nextBatch.second());
            }
            return this.reader.read(current, nextRow++);
        }

        @Override
        public void close() throws IOException {
            batchIter.close();
        }
    }
}

18 Source : TestSchemaAndMappingUpdate.java
with Apache License 2.0
from apache

@Test
public void testModificationWithMetricsMetrics() {
    NameMapping mapping = MappingUtil.create(table.schema());
    String mappingJson = NameMappingParser.toJson(mapping);
    table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).set("write.metadata.metrics.column.id", "full").commit();
    replacedertHelpers.replacedertThrows("Creating metrics for non-existent column fails", ValidationException.clreplaced, null, () -> table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).set("write.metadata.metrics.column.ids", "full").commit());
    replacedertHelpers.replacedertThrows("Deleting a column with metrics fails", ValidationException.clreplaced, null, () -> table.updateSchema().deleteColumn("id").commit());
    String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping updated = NameMappingParser.fromJson(updatedJson);
    // should not change the mapping
    validateUnchanged(mapping, updated);
}

18 Source : TestSchemaAndMappingUpdate.java
with Apache License 2.0
from apache

@Test
public void testDeleteColumn() {
    NameMapping mapping = MappingUtil.create(table.schema());
    String mappingJson = NameMappingParser.toJson(mapping);
    table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit();
    table.updateSchema().deleteColumn("id").commit();
    String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping updated = NameMappingParser.fromJson(updatedJson);
    // should not change the mapping
    validateUnchanged(mapping, updated);
}

18 Source : TestAvroNameMapping.java
with Apache License 2.0
from apache

@Test
public void testMissingRequiredFields() {
    Schema writeSchema = new Schema(Types.NestedField.required(19, "x", Types.IntegerType.get()), Types.NestedField.optional(18, "y", Types.IntegerType.get()));
    Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table"));
    record.put("x", 1);
    record.put("y", 2);
    // table mapping not projecting a required field 'x'
    NameMapping nameMapping = MappingUtil.create(new Schema(Types.NestedField.optional(18, "y", Types.IntegerType.get())));
    Schema readSchema = writeSchema;
    replacedertHelpers.replacedertThrows("Missing required field in nameMapping", IllegalArgumentException.clreplaced, "Missing required field: x", // In this case, pruneColumns result is an empty record
    () -> writeAndRead(writeSchema, readSchema, record, nameMapping));
}

18 Source : SchemaUpdate.java
with Apache License 2.0
from apache

private TableMetadata applyChangesToMapping(TableMetadata metadata) {
    String mappingJson = metadata.property(TableProperties.DEFAULT_NAME_MAPPING, null);
    if (mappingJson != null) {
        try {
            // parse and update the mapping
            NameMapping mapping = NameMappingParser.fromJson(mappingJson);
            NameMapping updated = MappingUtil.update(mapping, updates, adds);
            // replace the table property
            Map<String, String> updatedProperties = Maps.newHashMap();
            updatedProperties.putAll(metadata.properties());
            updatedProperties.put(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(updated));
            return metadata.replaceProperties(updatedProperties);
        } catch (RuntimeException e) {
            // log the error, but do not fail the update
            LOG.warn("Failed to update external schema mapping: {}", mappingJson, e);
        }
    }
    return metadata;
}

18 Source : PruneColumns.java
with Apache License 2.0
from apache

class PruneColumns extends AvroSchemaVisitor<Schema> {

    private static final Logger LOG = LoggerFactory.getLogger(PruneColumns.class);

    private final Set<Integer> selectedIds;

    private final NameMapping nameMapping;

    PruneColumns(Set<Integer> selectedIds, NameMapping nameMapping) {
        Preconditions.checkNotNull(selectedIds, "Selected field ids cannot be null");
        this.selectedIds = selectedIds;
        this.nameMapping = nameMapping;
    }

    Schema rootSchema(Schema record) {
        Schema result = visit(record, this);
        if (result != null) {
            return result;
        }
        return copyRecord(record, ImmutableList.of());
    }

    @Override
    public Schema record(Schema record, List<String> names, List<Schema> fields) {
        // Then this should access the record's fields by name
        List<Schema.Field> filteredFields = Lists.newArrayListWithExpectedSize(fields.size());
        boolean hasChange = false;
        for (Schema.Field field : record.getFields()) {
            Integer fieldId = AvroSchemaUtil.getFieldId(field, nameMapping, fieldNames());
            if (fieldId == null) {
                // Neither the schema nor the nameMapping has a field id, so prune this field.
                continue;
            }
            if (!AvroSchemaUtil.hasFieldId(field)) {
                // fieldId was resolved from nameMapping, we updated hasChange
                // flag to make sure a new field is created with the field id
                hasChange = true;
            }
            if (isOptionSchemaWithNonNullFirstOption(field.schema())) {
                // if the field has an optional schema where the first option is not NULL,
                // we update hasChange flag to make sure we reorder the schema and make the
                // NULL option as the first
                hasChange = true;
            }
            Schema fieldSchema = fields.get(field.pos());
            // All primitives are selected by selecting the field, but map and list
            // types can be selected by projecting the keys, values, or elements.
            // This creates two conditions where the field should be selected: if the
            // id is selected or if the result of the field is non-null. The only
            // case where the converted field is non-null is when a map or list is
            // selected by lower IDs.
            if (selectedIds.contains(fieldId)) {
                filteredFields.add(copyField(field, field.schema(), fieldId));
            } else if (fieldSchema != null) {
                hasChange = true;
                filteredFields.add(copyField(field, fieldSchema, fieldId));
            }
        }
        if (hasChange) {
            return copyRecord(record, filteredFields);
        } else if (filteredFields.size() == record.getFields().size()) {
            return record;
        } else if (!filteredFields.isEmpty()) {
            return copyRecord(record, filteredFields);
        }
        return null;
    }

    @Override
    public Schema union(Schema union, List<Schema> options) {
        Preconditions.checkState(AvroSchemaUtil.isOptionSchema(union), "Invalid schema: non-option unions are not supported: %s", union);
        // only unions with null are allowed, and a null schema results in null
        Schema pruned = null;
        if (options.get(0) != null) {
            pruned = options.get(0);
        } else if (options.get(1) != null) {
            pruned = options.get(1);
        }
        if (pruned != null) {
            if (pruned != AvroSchemaUtil.fromOption(union)) {
                return AvroSchemaUtil.toOption(pruned);
            }
            return union;
        }
        return null;
    }

    @Override
    @SuppressWarnings("checkstyle:CyclomaticComplexity")
    public Schema array(Schema array, Schema element) {
        if (array.getLogicalType() instanceof LogicalMap) {
            Schema keyValue = array.getElementType();
            Integer keyId = AvroSchemaUtil.getFieldId(keyValue.getField("key"), nameMapping, fieldNames());
            Integer valueId = AvroSchemaUtil.getFieldId(keyValue.getField("value"), nameMapping, fieldNames());
            if (keyId == null || valueId == null) {
                if (keyId != null || valueId != null) {
                    LOG.warn("Map schema {} should have both key and value ids set or both unset", array);
                }
                return null;
            }
            // if either key or value is selected, the whole map must be projected
            if (selectedIds.contains(keyId) || selectedIds.contains(valueId)) {
                return complexMapWithIds(array, keyId, valueId);
            } else if (element != null) {
                Schema.Field keyProjectionField = element.getField("key");
                Schema valueProjection = element.getField("value").schema();
                // it is possible that key is not selected, and
                // key schemas can be different if new field ids were assigned to them
                if (keyProjectionField != null && keyValue.getField("key").schema() != keyProjectionField.schema()) {
                    Preconditions.checkState(SchemaNormalization.parsingFingerprint64(keyValue.getField("key").schema()) == SchemaNormalization.parsingFingerprint64(keyProjectionField.schema()), "Map keys should not be projected");
                    return AvroSchemaUtil.createMap(keyId, keyProjectionField.schema(), valueId, valueProjection);
                } else if (keyValue.getField("value").schema() != valueProjection) {
                    return AvroSchemaUtil.createMap(keyId, keyValue.getField("key").schema(), valueId, valueProjection);
                } else {
                    return complexMapWithIds(array, keyId, valueId);
                }
            }
        } else {
            Integer elementId = AvroSchemaUtil.getElementId(array, nameMapping, fieldNames());
            if (elementId == null) {
                return null;
            }
            if (selectedIds.contains(elementId)) {
                return arrayWithId(array, elementId);
            } else if (element != null) {
                if (element != array.getElementType()) {
                    // the element must be a projection
                    return arrayWithId(Schema.createArray(element), elementId);
                }
                return arrayWithId(array, elementId);
            }
        }
        return null;
    }

    @Override
    public Schema map(Schema map, Schema value) {
        Integer keyId = AvroSchemaUtil.getKeyId(map, nameMapping, fieldNames());
        Integer valueId = AvroSchemaUtil.getValueId(map, nameMapping, fieldNames());
        if (keyId == null || valueId == null) {
            if (keyId != null || valueId != null) {
                LOG.warn("Map schema {} should have both key and value ids set or both unset", map);
            }
            return null;
        }
        // if either key or value is selected, the whole map must be projected
        if (selectedIds.contains(keyId) || selectedIds.contains(valueId)) {
        // assign ids. Ids may not always be present in the schema,
        // e.g. if we are reading data not written by Iceberg writers
            return mapWithIds(map, keyId, valueId);
        } else if (value != null) {
            if (value != map.getValueType()) {
                // the value must be a projection
                return mapWithIds(Schema.createMap(value), keyId, valueId);
            }
            return map;
        }
        return null;
    }

    private Schema arrayWithId(Schema array, Integer elementId) {
        if (!AvroSchemaUtil.hasProperty(array, AvroSchemaUtil.ELEMENT_ID_PROP)) {
            Schema result = Schema.createArray(array.getElementType());
            result.addProp(AvroSchemaUtil.ELEMENT_ID_PROP, elementId);
            return result;
        }
        return array;
    }

    private Schema complexMapWithIds(Schema map, Integer keyId, Integer valueId) {
        Schema keyValue = map.getElementType();
        if (!AvroSchemaUtil.hasFieldId(keyValue.getField("key")) || !AvroSchemaUtil.hasFieldId(keyValue.getField("value"))) {
            return AvroSchemaUtil.createMap(keyId, keyValue.getField("key").schema(), valueId, keyValue.getField("value").schema());
        }
        return map;
    }

    private Schema mapWithIds(Schema map, Integer keyId, Integer valueId) {
        if (!AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.KEY_ID_PROP) || !AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.VALUE_ID_PROP)) {
            Schema result = Schema.createMap(map.getValueType());
            result.addProp(AvroSchemaUtil.KEY_ID_PROP, keyId);
            result.addProp(AvroSchemaUtil.VALUE_ID_PROP, valueId);
            return result;
        }
        return map;
    }

    @Override
    public Schema primitive(Schema primitive) {
        // primitives are not selected directly
        return null;
    }

    private static Schema copyRecord(Schema record, List<Schema.Field> newFields) {
        Schema copy = Schema.createRecord(record.getName(), record.getDoc(), record.getNamespace(), record.isError(), newFields);
        for (Map.Entry<String, Object> prop : record.getObjectProps().entrySet()) {
            copy.addProp(prop.getKey(), prop.getValue());
        }
        return copy;
    }

    private static Schema.Field copyField(Schema.Field field, Schema newSchema, Integer fieldId) {
        Schema newSchemaReordered;
        // if the newSchema is an optional schema, make sure the NULL option is always the first
        if (isOptionSchemaWithNonNullFirstOption(newSchema)) {
            newSchemaReordered = AvroSchemaUtil.toOption(AvroSchemaUtil.fromOption(newSchema));
        } else {
            newSchemaReordered = newSchema;
        }
        // do not copy over default values as the file is expected to have values for fields already in the file schema
        Schema.Field copy = new Schema.Field(field.name(), newSchemaReordered, field.doc(), AvroSchemaUtil.isOptionSchema(newSchemaReordered) ? JsonProperties.NULL_VALUE : null, field.order());
        for (Map.Entry<String, Object> prop : field.getObjectProps().entrySet()) {
            copy.addProp(prop.getKey(), prop.getValue());
        }
        if (AvroSchemaUtil.hasFieldId(field)) {
            int existingFieldId = AvroSchemaUtil.getFieldId(field);
            Preconditions.checkArgument(existingFieldId == fieldId, "Existing field id does not match the id resolved from the name mapping");
        } else {
            // field may not have a fieldId if the fieldId was fetched from nameMapping
            copy.addProp(AvroSchemaUtil.FIELD_ID_PROP, fieldId);
        }
        return copy;
    }

    private static boolean isOptionSchemaWithNonNullFirstOption(Schema schema) {
        return AvroSchemaUtil.isOptionSchema(schema) && schema.getTypes().get(0).getType() != Schema.Type.NULL;
    }
}

18 Source : AvroSchemaUtil.java
with Apache License 2.0
from apache

static Integer getKeyId(Schema schema, NameMapping nameMapping, Iterable<String> parentFieldNames) {
    Preconditions.checkArgument(schema.getType() == MAP, "Cannot get map key id for non-map schema: %s", schema);
    List<String> names = Lists.newArrayList(parentFieldNames);
    names.add("key");
    return getId(schema, KEY_ID_PROP, nameMapping, names);
}

18 Source : AvroSchemaUtil.java
with Apache License 2.0
from apache

static Integer getValueId(Schema schema, NameMapping nameMapping, Iterable<String> parentFieldNames) {
    Preconditions.checkArgument(schema.getType() == MAP, "Cannot get map value id for non-map schema: %s", schema);
    List<String> names = Lists.newArrayList(parentFieldNames);
    names.add("value");
    return getId(schema, VALUE_ID_PROP, nameMapping, names);
}

18 Source : AvroSchemaUtil.java
with Apache License 2.0
from apache

static Integer getElementId(Schema schema, NameMapping nameMapping, Iterable<String> parentFieldNames) {
    Preconditions.checkArgument(schema.getType() == ARRAY, "Cannot get array element id for non-array schema: %s", schema);
    List<String> names = Lists.newArrayList(parentFieldNames);
    names.add("element");
    return getId(schema, ELEMENT_ID_PROP, nameMapping, names);
}

17 Source : SparkTableUtil.java
with Apache License 2.0
from apache

private static void importUnpartitionedSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable) {
    try {
        CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent);
        Option<String> format = sourceTable.storage().serde().nonEmpty() ? sourceTable.storage().serde() : sourceTable.provider();
        Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format");
        Map<String, String> partition = Collections.emptyMap();
        PartitionSpec spec = PartitionSpec.unpartitioned();
        Configuration conf = spark.sessionState().newHadoopConf();
        MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());
        String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
        NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
        List<DataFile> files = listPartition(partition, Util.uriToString(sourceTable.location()), format.get(), spec, conf, metricsConfig, nameMapping);
        AppendFiles append = targetTable.newAppend();
        files.forEach(append::appendFile);
        append.commit();
    } catch (NoSuchDatabaseException e) {
        throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", sourceTableIdent);
    } catch (NoSuchTableException e) {
        throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", sourceTableIdent);
    }
}

17 Source : SparkTableUtil.java
with Apache License 2.0
from apache

private static List<DataFile> listParquetPartition(Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf, MetricsConfig metricsSpec, NameMapping mapping) {
    try {
        Path partition = new Path(partitionUri);
        FileSystem fs = partition.getFileSystem(conf);
        return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER)).filter(FileStatus::isFile).map(stat -> {
            Metrics metrics;
            try {
                ParquetMetadata metadata = ParquetFileReader.readFooter(conf, stat);
                metrics = ParquetUtil.footerMetrics(metadata, Stream.empty(), metricsSpec, mapping);
            } catch (IOException e) {
                throw SparkExceptionUtil.toUncheckedException(e, "Unable to read the footer of the parquet file: %s", stat.getPath());
            }
            String partitionKey = spec.fields().stream().map(PartitionField::name).map(name -> String.format("%s=%s", name, partitionPath.get(name))).collect(Collectors.joining("/"));
            return DataFiles.builder(spec).withPath(stat.getPath().toString()).withFormat("parquet").withFileSizeInBytes(stat.getLen()).withMetrics(metrics).withPartitionPath(partitionKey).build();
        }).collect(Collectors.toList());
    } catch (IOException e) {
        throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in parreplacedion: %s", parreplacedionUri);
    }
}

17 Source : VectorizedParquetReader.java
with Apache License 2.0
from apache

public class VectorizedParquetReader<T> extends CloseableGroup implements CloseableIterable<T> {

    private final InputFile input;

    private final Schema expectedSchema;

    private final ParquetReadOptions options;

    private final Function<MessageType, VectorizedReader<?>> batchReaderFunc;

    private final Expression filter;

    private boolean reuseContainers;

    private final boolean caseSensitive;

    private final int batchSize;

    private final NameMapping nameMapping;

    public VectorizedParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options, Function<MessageType, VectorizedReader<?>> readerFunc, NameMapping nameMapping, Expression filter, boolean reuseContainers, boolean caseSensitive, int maxRecordsPerBatch) {
        this.input = input;
        this.expectedSchema = expectedSchema;
        this.options = options;
        this.batchReaderFunc = readerFunc;
        // replace alwaysTrue with null to avoid extra work evaluating a trivial filter
        this.filter = filter == Expressions.alwaysTrue() ? null : filter;
        this.reuseContainers = reuseContainers;
        this.caseSensitive = caseSensitive;
        this.batchSize = maxRecordsPerBatch;
        this.nameMapping = nameMapping;
    }

    private ReadConf conf = null;

    private ReadConf init() {
        if (conf == null) {
            ReadConf readConf = new ReadConf(input, options, expectedSchema, filter, null, batchReaderFunc, nameMapping, reuseContainers, caseSensitive, batchSize);
            this.conf = readConf.copy();
            return readConf;
        }
        return conf;
    }

    @Override
    public CloseableIterator<T> iterator() {
        FileIterator<T> iter = new FileIterator<>(init());
        addCloseable(iter);
        return iter;
    }

    private static class FileIterator<T> implements CloseableIterator<T> {

        private final ParquetFileReader reader;

        private final boolean[] shouldSkip;

        private final VectorizedReader<T> model;

        private final long totalValues;

        private final int batchSize;

        private final List<Map<ColumnPath, ColumnChunkMetaData>> columnChunkMetadata;

        private final boolean reuseContainers;

        private int nextRowGroup = 0;

        private long nextRowGroupStart = 0;

        private long valuesRead = 0;

        private T last = null;

        private final long[] rowGroupsStartRowPos;

        FileIterator(ReadConf conf) {
            this.reader = conf.reader();
            this.shouldSkip = conf.shouldSkip();
            this.totalValues = conf.totalValues();
            this.reuseContainers = conf.reuseContainers();
            this.model = conf.vectorizedModel();
            this.batchSize = conf.batchSize();
            this.model.setBatchSize(this.batchSize);
            this.columnChunkMetadata = conf.columnChunkMetadataForRowGroups();
            this.rowGroupsStartRowPos = conf.startRowPositions();
        }

        @Override
        public boolean hasNext() {
            return valuesRead < totalValues;
        }

        @Override
        public T next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            if (valuesRead >= nextRowGroupStart) {
                advance();
            }
            // batchSize is an integer, so casting to integer is safe
            int numValuesToRead = (int) Math.min(nextRowGroupStart - valuesRead, batchSize);
            if (reuseContainers) {
                this.last = model.read(last, numValuesToRead);
            } else {
                this.last = model.read(null, numValuesToRead);
            }
            valuesRead += numValuesToRead;
            return last;
        }

        private void advance() {
            while (shouldSkip[nextRowGroup]) {
                nextRowGroup += 1;
                reader.skipNextRowGroup();
            }
            PageReadStore pages;
            try {
                pages = reader.readNextRowGroup();
            } catch (IOException e) {
                throw new RuntimeIOException(e);
            }
            long rowPosition = rowGroupsStartRowPos[nextRowGroup];
            model.setRowGroupInfo(pages, columnChunkMetadata.get(nextRowGroup), rowPosition);
            nextRowGroupStart += pages.getRowCount();
            nextRowGroup += 1;
        }

        @Override
        public void close() throws IOException {
            model.close();
            reader.close();
        }
    }
}

17 Source : ParquetUtil.java
with Apache License 2.0
from apache

public static Metrics footerMetrics(ParquetMetadata metadata, Stream<FieldMetrics> fieldMetrics, MetricsConfig metricsConfig, NameMapping nameMapping) {
    long rowCount = 0;
    Map<Integer, Long> columnSizes = Maps.newHashMap();
    Map<Integer, Long> valueCounts = Maps.newHashMap();
    Map<Integer, Long> nullValueCounts = Maps.newHashMap();
    Map<Integer, Literal<?>> lowerBounds = Maps.newHashMap();
    Map<Integer, Literal<?>> upperBounds = Maps.newHashMap();
    Set<Integer> missingStats = Sets.newHashSet();
    // ignore metrics for fields for which we could not determine reliable IDs
    MessageType parquetTypeWithIds = getParquetTypeWithIds(metadata, nameMapping);
    Schema fileSchema = ParquetSchemaUtil.convertAndPrune(parquetTypeWithIds);
    List<BlockMetaData> blocks = metadata.getBlocks();
    for (BlockMetaData block : blocks) {
        rowCount += block.getRowCount();
        for (ColumnChunkMetaData column : block.getColumns()) {
            Integer fieldId = fileSchema.aliasToId(column.getPath().toDotString());
            if (fieldId == null) {
                // fileSchema may contain a subset of columns present in the file
                // as we prune columns we could not assign ids
                continue;
            }
            increment(columnSizes, fieldId, column.getTotalSize());
            MetricsMode metricsMode = MetricsUtil.metricsMode(fileSchema, metricsConfig, fieldId);
            if (metricsMode == MetricsModes.None.get()) {
                continue;
            }
            increment(valueCounts, fieldId, column.getValueCount());
            Statistics stats = column.getStatistics();
            if (stats == null) {
                missingStats.add(fieldId);
            } else if (!stats.isEmpty()) {
                increment(nullValueCounts, fieldId, stats.getNumNulls());
                if (metricsMode != MetricsModes.Counts.get()) {
                    Types.NestedField field = fileSchema.findField(fieldId);
                    if (field != null && stats.hasNonNullValue() && shouldStoreBounds(column, fileSchema)) {
                        Literal<?> min = ParquetConversions.fromParquetPrimitive(field.type(), column.getPrimitiveType(), stats.genericGetMin());
                        updateMin(lowerBounds, fieldId, field.type(), min, metricsMode);
                        Literal<?> max = ParquetConversions.fromParquetPrimitive(field.type(), column.getPrimitiveType(), stats.genericGetMax());
                        updateMax(upperBounds, fieldId, field.type(), max, metricsMode);
                    }
                }
            }
        }
    }
    // discard accumulated values if any stats were missing
    for (Integer fieldId : missingStats) {
        nullValueCounts.remove(fieldId);
        lowerBounds.remove(fieldId);
        upperBounds.remove(fieldId);
    }
    return new Metrics(rowCount, columnSizes, valueCounts, nullValueCounts, MetricsUtil.createNanValueCounts(fieldMetrics, metricsConfig, fileSchema), toBufferMap(fileSchema, lowerBounds), toBufferMap(fileSchema, upperBounds));
}

17 Source : ParquetReader.java
with Apache License 2.0
from apache

public class ParquetReader<T> extends CloseableGroup implements CloseableIterable<T> {

    private final InputFile input;

    private final Schema expectedSchema;

    private final ParquetReadOptions options;

    private final Function<MessageType, ParquetValueReader<?>> readerFunc;

    private final Expression filter;

    private final boolean reuseContainers;

    private final boolean caseSensitive;

    private final NameMapping nameMapping;

    public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options, Function<MessageType, ParquetValueReader<?>> readerFunc, NameMapping nameMapping, Expression filter, boolean reuseContainers, boolean caseSensitive) {
        this.input = input;
        this.expectedSchema = expectedSchema;
        this.options = options;
        this.readerFunc = readerFunc;
        // replace alwaysTrue with null to avoid extra work evaluating a trivial filter
        this.filter = filter == Expressions.alwaysTrue() ? null : filter;
        this.reuseContainers = reuseContainers;
        this.caseSensitive = caseSensitive;
        this.nameMapping = nameMapping;
    }

    private ReadConf<T> conf = null;

    private ReadConf<T> init() {
        if (conf == null) {
            ReadConf<T> readConf = new ReadConf<>(input, options, expectedSchema, filter, readerFunc, null, nameMapping, reuseContainers, caseSensitive, null);
            this.conf = readConf.copy();
            return readConf;
        }
        return conf;
    }

    @Override
    public CloseableIterator<T> iterator() {
        FileIterator<T> iter = new FileIterator<>(init());
        addCloseable(iter);
        return iter;
    }

    private static class FileIterator<T> implements CloseableIterator<T> {

        private final ParquetFileReader reader;

        private final boolean[] shouldSkip;

        private final ParquetValueReader<T> model;

        private final long totalValues;

        private final boolean reuseContainers;

        private final long[] rowGroupsStartRowPos;

        private int nextRowGroup = 0;

        private long nextRowGroupStart = 0;

        private long valuesRead = 0;

        private T last = null;

        FileIterator(ReadConf<T> conf) {
            this.reader = conf.reader();
            this.shouldSkip = conf.shouldSkip();
            this.model = conf.model();
            this.totalValues = conf.totalValues();
            this.reuseContainers = conf.reuseContainers();
            this.rowGroupsStartRowPos = conf.startRowPositions();
        }

        @Override
        public boolean hasNext() {
            return valuesRead < totalValues;
        }

        @Override
        public T next() {
            if (valuesRead >= nextRowGroupStart) {
                advance();
            }
            if (reuseContainers) {
                this.last = model.read(last);
            } else {
                this.last = model.read(null);
            }
            valuesRead += 1;
            return last;
        }

        private void advance() {
            while (shouldSkip[nextRowGroup]) {
                nextRowGroup += 1;
                reader.skipNextRowGroup();
            }
            PageReadStore pages;
            try {
                pages = reader.readNextRowGroup();
            } catch (IOException e) {
                throw new RuntimeIOException(e);
            }
            long rowPosition = rowGroupsStartRowPos[nextRowGroup];
            nextRowGroupStart += pages.getRowCount();
            nextRowGroup += 1;
            model.setPageSource(pages, rowPosition);
        }

        @Override
        public void close() throws IOException {
            reader.close();
        }
    }
}

17 Source : TestSchemaAndMappingUpdate.java
with Apache License 2.0
from apache

@Test
public void testRenameColumn() {
    NameMapping mapping = MappingUtil.create(table.schema());
    String mappingJson = NameMappingParser.toJson(mapping);
    table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit();
    table.updateSchema().renameColumn("id", "object_id").commit();
    String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping updated = NameMappingParser.fromJson(updatedJson);
    int idColumnId = table.schema().findField("object_id").fieldId();
    validateUnchanged(Iterables.filter(mapping.asMappedFields().fields(), field -> !Objects.equals(idColumnId, field.id())), updated);
    MappedField updatedMapping = updated.find(idColumnId);
    replacedert.replacedertNotNull("Mapping for id column should exist", updatedMapping);
    replacedert.replacedertEquals("Should add the new column name to the existing mapping", MappedField.of(idColumnId, ImmutableList.of("id", "object_id")), updatedMapping);
}

17 Source : TestSchemaAndMappingUpdate.java
with Apache License 2.0
from apache

/**
 * Asserts that the fields in the original mapping are unchanged in the updated mapping.
 */
private void validateUnchanged(Iterable<MappedField> fields, NameMapping updated) {
    MappedFields updatedFields = updated.asMappedFields();
    for (MappedField field : fields) {
        replacedert.replacedertEquals("Existing fields should not change", field, updatedFields.field(field.id()));
    }
}

17 Source : TestSchemaAndMappingUpdate.java
with Apache License 2.0
from apache

/**
 * Asserts that the fields in the original mapping are unchanged in the updated mapping.
 */
private void validateUnchanged(NameMapping original, NameMapping updated) {
    MappedFields updatedFields = updated.asMappedFields();
    for (MappedField field : original.asMappedFields().fields()) {
        replacedert.replacedertEquals("Existing fields should not change", field, updatedFields.field(field.id()));
    }
}

17 Source : TestAvroNameMapping.java
with Apache License 2.0
from apache

private Record writeAndRead(Schema writeSchema, Schema readSchema, Record record, NameMapping nameMapping) throws IOException {
    File file = temp.newFile();
    // Write without file ids
    org.apache.avro.Schema writeAvroSchema = RemoveIds.removeIds(writeSchema);
    DatumWriter<Record> datumWriter = new GenericDatumWriter<>(writeAvroSchema);
    try (DataFileWriter<Record> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(writeAvroSchema, file);
        dataFileWriter.append(record);
    }
    Iterable<GenericData.Record> records = Avro.read(Files.localInput(file)).project(readSchema).withNameMapping(nameMapping).build();
    return Iterables.getOnlyElement(records);
}

17 Source : ProjectionDatumReader.java
with Apache License 2.0
from apache

public class ProjectionDatumReader<D> implements DatumReader<D>, SupportsRowPosition {

    private final Function<Schema, DatumReader<?>> getReader;

    private final org.apache.iceberg.Schema expectedSchema;

    private final Map<String, String> renames;

    private NameMapping nameMapping;

    private Schema readSchema = null;

    private Schema fileSchema = null;

    private DatumReader<D> wrapped = null;

    public ProjectionDatumReader(Function<Schema, DatumReader<?>> getReader, org.apache.iceberg.Schema expectedSchema, Map<String, String> renames, NameMapping nameMapping) {
        this.getReader = getReader;
        this.expectedSchema = expectedSchema;
        this.renames = renames;
        this.nameMapping = nameMapping;
    }

    @Override
    public void setRowPositionSupplier(Supplier<Long> posSupplier) {
        if (wrapped instanceof SupportsRowPosition) {
            ((SupportsRowPosition) wrapped).setRowPositionSupplier(posSupplier);
        }
    }

    @Override
    public void setSchema(Schema newFileSchema) {
        this.fileSchema = newFileSchema;
        if (nameMapping == null && !AvroSchemaUtil.hasIds(fileSchema)) {
            nameMapping = MappingUtil.create(expectedSchema);
        }
        Set<Integer> projectedIds = TypeUtil.getProjectedIds(expectedSchema);
        Schema prunedSchema = AvroSchemaUtil.pruneColumns(newFileSchema, projectedIds, nameMapping);
        this.readSchema = AvroSchemaUtil.buildAvroProjection(prunedSchema, expectedSchema, renames);
        this.wrapped = newDatumReader();
    }

    @Override
    public D read(D reuse, Decoder in) throws IOException {
        return wrapped.read(reuse, in);
    }

    @SuppressWarnings("unchecked")
    private DatumReader<D> newDatumReader() {
        DatumReader<D> reader = (DatumReader<D>) getReader.apply(readSchema);
        reader.setSchema(fileSchema);
        return reader;
    }
}
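
For orientation, a rough sketch of wiring this reader directly into Avro's file reader; the supported entry point is the Avro.read(...).withNameMapping(...) builder shown in writeAndRead above, so treat the reader factory (GenericDatumReader::new), the file name, and the schema below as illustrative assumptions.

// expected Iceberg schema; "events.avro" is a hypothetical Avro file written without field-id properties
org.apache.iceberg.Schema expected = new org.apache.iceberg.Schema(
    Types.NestedField.required(1, "id", Types.LongType.get()),
    Types.NestedField.optional(2, "data", Types.StringType.get()));
NameMapping mapping = MappingUtil.create(expected);
// when the file schema carries no IDs, the reader falls back to the name mapping
ProjectionDatumReader<GenericData.Record> datumReader =
    new ProjectionDatumReader<>(GenericDatumReader::new, expected, ImmutableMap.of(), mapping);
try (DataFileReader<GenericData.Record> reader = new DataFileReader<>(new File("events.avro"), datumReader)) {
    // DataFileReader calls setSchema(...) with the writer schema, which triggers the projection above
    reader.forEach(System.out::println);
}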

17 Source : AvroSchemaUtil.java
with Apache License 2.0
from apache

static Integer getFieldId(Schema.Field field, NameMapping nameMapping, Iterable<String> parentFieldNames) {
    Object id = field.getObjectProp(FIELD_ID_PROP);
    if (id != null) {
        return toInt(id);
    } else if (nameMapping != null) {
        List<String> names = Lists.newArrayList(parentFieldNames);
        names.add(field.name());
        MappedField mappedField = nameMapping.find(names);
        if (mappedField != null) {
            return mappedField.id();
        }
    }
    return null;
}
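
A small illustrative sketch of the lookup this helper relies on (the schema and field names below are invented for the example): the mapping created by MappingUtil.create is keyed by the path of parent names plus the field name, and an unmapped path returns null, which getFieldId surfaces as a missing ID.

org.apache.iceberg.Schema schema = new org.apache.iceberg.Schema(
    Types.NestedField.required(1, "id", Types.LongType.get()),
    Types.NestedField.optional(2, "location", Types.StructType.of(
        Types.NestedField.optional(3, "lat", Types.DoubleType.get()))));
NameMapping mapping = MappingUtil.create(schema);
// the parent names plus the field name form the lookup path
Assert.assertEquals((Integer) 3, mapping.find("location", "lat").id());
// a path that is not in the mapping yields null, so no ID is assigned
Assert.assertNull(mapping.find("location", "unknown_field"));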

17 Source : AvroSchemaUtil.java
with Apache License 2.0
from apache

private static Integer getId(Schema schema, String propertyName, NameMapping nameMapping, List<String> names) {
    if (schema.getType() == UNION) {
        return getId(fromOption(schema), propertyName, nameMapping, names);
    }
    Object id = schema.getObjectProp(propertyName);
    if (id != null) {
        return toInt(id);
    } else if (nameMapping != null) {
        MappedField mappedField = nameMapping.find(names);
        if (mappedField != null) {
            return mappedField.id();
        }
    }
    return null;
}

16 Source : TestSparkTableUtilWithInMemoryCatalog.java
with Apache License 2.0
from apache

@Test
public void testImportTableWithMappingForNestedDataPartitionedTable() throws IOException {
    File parquetTableDir = temp.newFolder("parquet_table");
    String parquetTableLocation = parquetTableDir.toURI().toString();
    try {
        Dataset<Row> df1 = spark.range(1, 2).withColumn("extra_col", functions.lit(-1)).withColumn("struct", functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')")).withColumn("data", functions.lit("Z"));
        Dataset<Row> df2 = spark.range(2, 3).withColumn("extra_col", functions.lit(-1)).withColumn("struct", functions.expr("named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')")).withColumn("data", functions.lit("Z"));
        df1.union(df2).coalesce(1).select("id", "extra_col", "struct", "data").write().format("parquet").mode("append").option("path", parquetTableLocation).partitionBy("data").saveAsTable("parquet_table");
        // don't include `extra_col` and `nested_2` on purpose
        Schema schema = new Schema(optional(1, "id", Types.LongType.get()), required(2, "struct", Types.StructType.of(required(4, "nested_1", Types.StringType.get()), required(5, "nested_3", Types.StringType.get()))), required(3, "data", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build();
        Table table = TABLES.create(schema, spec, tableLocation);
        // assign a custom metrics config and a name mapping
        NameMapping nameMapping = MappingUtil.create(schema);
        table.updateProperties().set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts").set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full").set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full").set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)).commit();
        File stagingDir = temp.newFolder("staging-dir");
        SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
        // validate we get the expected results back
        List<Row> expected = spark.table("parquet_table").select("id", "struct.nested_1", "struct.nested_3", "data").collectAsList();
        List<Row> actual = spark.read().format("iceberg").load(tableLocation).select("id", "struct.nested_1", "struct.nested_3", "data").collectAsList();
        replacedert.replacedertEquals("Rows must match", expected, actual);
        // validate we persisted correct metrics
        Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");
        List<Row> bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList();
        replacedert.replacedertEquals("Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size());
        replacedert.replacedertEquals("Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size());
        Types.NestedField nestedField1 = table.schema().findField("struct.nested_1");
        checkFieldMetrics(fileDF, nestedField1, true);
        Types.NestedField id = table.schema().findField("id");
        checkFieldMetrics(fileDF, id, 1L, 2L);
        Types.NestedField nestedField3 = table.schema().findField("struct.nested_3");
        checkFieldMetrics(fileDF, nestedField3, "f", "g");
    } finally {
        spark.sql("DROP TABLE parquet_table");
    }
}

16 Source : TestSparkTableUtilWithInMemoryCatalog.java
with Apache License 2.0
from apache

@Test
public void testImportTableWithMappingForNestedData() throws IOException {
    File parquetTableDir = temp.newFolder("parquet_table");
    String parquetTableLocation = parquetTableDir.toURI().toString();
    try {
        Dataset<Row> df1 = spark.range(1, 2).withColumn("extra_col", functions.lit(-1)).withColumn("struct", functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')"));
        Dataset<Row> df2 = spark.range(2, 3).withColumn("extra_col", functions.lit(-1)).withColumn("struct", functions.expr("named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')"));
        df1.union(df2).coalesce(1).select("id", "extra_col", "struct").write().format("parquet").mode("append").option("path", parquetTableLocation).saveAsTable("parquet_table");
        // don't include `extra_col` and `nested_2` on purpose
        Schema schema = new Schema(optional(1, "id", Types.LongType.get()), required(2, "struct", Types.StructType.of(required(3, "nested_1", Types.StringType.get()), required(4, "nested_3", Types.StringType.get()))));
        Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), tableLocation);
        // assign a custom metrics config and a name mapping
        NameMapping nameMapping = MappingUtil.create(schema);
        table.updateProperties().set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts").set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full").set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full").set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)).commit();
        File stagingDir = temp.newFolder("staging-dir");
        SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
        // validate we get the expected results back
        List<Row> expected = spark.table("parquet_table").select("id", "struct.nested_1", "struct.nested_3").collectAsList();
        List<Row> actual = spark.read().format("iceberg").load(tableLocation).select("id", "struct.nested_1", "struct.nested_3").collectAsList();
        replacedert.replacedertEquals("Rows must match", expected, actual);
        // validate we persisted correct metrics
        Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");
        List<Row> bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList();
        replacedert.replacedertEquals("Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size());
        replacedert.replacedertEquals("Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size());
        Types.NestedField nestedField1 = table.schema().findField("struct.nested_1");
        checkFieldMetrics(fileDF, nestedField1, true);
        Types.NestedField id = table.schema().findField("id");
        checkFieldMetrics(fileDF, id, 1L, 2L);
        Types.NestedField nestedField3 = table.schema().findField("struct.nested_3");
        checkFieldMetrics(fileDF, nestedField3, "f", "g");
    } finally {
        spark.sql("DROP TABLE parquet_table");
    }
}

16 Source : SparkTableUtil.java
with Apache License 2.0
from apache

/**
 * Import files from the given partitions into an Iceberg table.
 *
 * @param spark a Spark session
 * @param partitions the partitions to import
 * @param targetTable the Iceberg table to import the data into
 * @param spec a partition spec
 * @param stagingDir a staging directory to store temporary manifest files
 */
public static void importSparkPartitions(SparkSession spark, List<SparkPartition> partitions, Table targetTable, PartitionSpec spec, String stagingDir) {
    Configuration conf = spark.sessionState().newHadoopConf();
    SerializableConfiguration serializableConf = new SerializableConfiguration(conf);
    int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism());
    int numShufflePartitions = spark.sessionState().conf().numShufflePartitions();
    MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());
    String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
    JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
    JavaRDD<SparkPartition> partitionRDD = sparkContext.parallelize(partitions, parallelism);
    Dataset<SparkPartition> partitionDS = spark.createDataset(partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class));
    List<ManifestFile> manifests = partitionDS.flatMap((FlatMapFunction<SparkPartition, DataFile>) sparkPartition -> listPartition(sparkPartition, spec, serializableConf, metricsConfig, nameMapping).iterator(), Encoders.javaSerialization(DataFile.class)).repartition(numShufflePartitions).map((MapFunction<DataFile, Tuple2<String, DataFile>>) file -> Tuple2.apply(file.path().toString(), file), Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))).orderBy(col("_1")).mapPartitions((MapPartitionsFunction<Tuple2<String, DataFile>, ManifestFile>) fileTuple -> buildManifest(serializableConf, spec, stagingDir, fileTuple), Encoders.javaSerialization(ManifestFile.class)).collectAsList();
    try {
        boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean(targetTable.properties(), TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);
        AppendFiles append = targetTable.newAppend();
        manifests.forEach(append::appendManifest);
        append.commit();
        if (!snapshotIdInheritanceEnabled) {
            // delete original manifests as they were rewritten before the commit
            deleteManifests(targetTable.io(), manifests);
        }
    } catch (Throwable e) {
        deleteManifests(targetTable.io(), manifests);
        throw e;
    }
}
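
A companion sketch (the spark, partitions, spec, stagingDir, and targetTable variables are assumed to exist in the calling context): storing a name mapping under DEFAULT_NAME_MAPPING before the import lets the per-file metrics collection above resolve columns by name in files that carry no field IDs.

NameMapping nameMapping = MappingUtil.create(targetTable.schema());
targetTable.updateProperties()
    .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping))
    .commit();
// importSparkPartitions reads the property back and passes the parsed mapping to listPartition
SparkTableUtil.importSparkPartitions(spark, partitions, targetTable, spec, stagingDir);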

16 Source : TestParquetSchemaUtil.java
with Apache License 2.0
from apache

@Test
public void testAssignIdsByNameMapping() {
    Types.StructType structType = Types.StructType.of(required(0, "id", Types.LongType.get()), optional(1, "list_of_maps", Types.ListType.ofOptional(2, Types.MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), optional(5, "map_of_lists", Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), required(9, "list_of_lists", Types.ListType.ofOptional(10, Types.ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), required(12, "map_of_maps", Types.MapType.ofOptional(13, 14, Types.StringType.get(), Types.MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), required(17, "list_of_struct_of_nested_types", Types.ListType.ofOptional(19, Types.StructType.of(Types.NestedField.required(20, "m1", Types.MapType.ofOptional(21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), Types.NestedField.optional(23, "l1", Types.ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), Types.NestedField.required(25, "l2", Types.ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), Types.NestedField.optional(27, "m2", Types.MapType.ofOptional(28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES))))));
    Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet).asStructType().fields());
    NameMapping nameMapping = MappingUtil.create(schema);
    MessageType messageTypeWithIds = ParquetSchemaUtil.convert(schema, "parquet_type");
    MessageType messageTypeWithIdsFromNameMapping = ParquetSchemaUtil.applyNameMapping(RemoveIds.removeIds(messageTypeWithIds), nameMapping);
    Assert.assertEquals(messageTypeWithIds, messageTypeWithIdsFromNameMapping);
}
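
A hedged read-side sketch that pairs with this test, assuming the Parquet read builder exposes withNameMapping analogously to the Avro builder used earlier; the file path and the generic reader function are illustrative, not taken from the test.

NameMapping mapping = MappingUtil.create(schema);
try (CloseableIterable<org.apache.iceberg.data.Record> rows =
        Parquet.read(Files.localInput(new File("no_ids.parquet")))
            .project(schema)
            .withNameMapping(mapping)   // assumed builder method; assigns IDs by name much like applyNameMapping above
            .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema))
            .build()) {
    rows.forEach(System.out::println);
}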

16 Source : TestORCSchemaUtil.java
with Apache License 2.0
from apache

@Test
public void testAssignIdsByNameMapping() {
    Types.StructType structType = Types.StructType.of(required(0, "id", Types.LongType.get()), optional(1, "list_of_maps", Types.ListType.ofOptional(2, Types.MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), optional(5, "map_of_lists", Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), required(9, "list_of_lists", Types.ListType.ofOptional(10, Types.ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), required(12, "map_of_maps", Types.MapType.ofOptional(13, 14, Types.StringType.get(), Types.MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), required(17, "list_of_struct_of_nested_types", Types.ListType.ofOptional(19, Types.StructType.of(Types.NestedField.required(20, "m1", Types.MapType.ofOptional(21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), Types.NestedField.optional(23, "l1", Types.ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), Types.NestedField.required(25, "l2", Types.ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), Types.NestedField.optional(27, "m2", Types.MapType.ofOptional(28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES))))));
    Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet).asStructType().fields());
    NameMapping nameMapping = MappingUtil.create(schema);
    TypeDescription typeDescriptionWithIds = ORCSchemaUtil.convert(schema);
    TypeDescription typeDescriptionWithIdsFromNameMapping = ORCSchemaUtil.applyNameMapping(ORCSchemaUtil.removeIds(typeDescriptionWithIds), nameMapping);
    replacedertTrue("TypeDescription schemas should be equal, including IDs", equalsWithIds(typeDescriptionWithIds, typeDescriptionWithIdsFromNameMapping));
}

16 Source : TestExpressionToSearchArgument.java
with Apache License 2.0
from apache

@Test
public void testModifiedSimpleSchemaNameMapping() {
    Schema originalSchema = new Schema(required(1, "int", Types.IntegerType.get()), optional(2, "long_to_be_dropped", Types.LongType.get()));
    Schema mappingSchema = new Schema(required(1, "int", Types.IntegerType.get()), optional(3, "new_float_field", Types.FloatType.get()));
    TypeDescription orcSchemaWithoutIds = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema));
    NameMapping nameMapping = MappingUtil.create(mappingSchema);
    TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(mappingSchema, ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping));
    Expression expr = equal("int", 1);
    Expression boundFilter = Binder.bind(mappingSchema.asStruct(), expr, true);
    SearchArgument expected = SearchArgumentFactory.newBuilder().equals("`int`", Type.LONG, 1L).build();
    SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
    Assert.assertEquals(expected.toString(), actual.toString());
    // for columns not in the file, buildOrcProjection renames the field by appending an _r<ID> suffix
    // the renamed column is passed down to ORC, which handles the missing column and returns a TruthValue during evaluation
    expr = equal("new_float_field", 1);
    boundFilter = Binder.bind(mappingSchema.asStruct(), expr, true);
    expected = SearchArgumentFactory.newBuilder().equals("`new_float_field_r3`", Type.FLOAT, 1.0).build();
    actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
    Assert.assertEquals(expected.toString(), actual.toString());
}

16 Source : TestExpressionToSearchArgument.java
with Apache License 2.0
from apache

@Test
public void testOriginalSchemaNameMapping() {
    Schema originalSchema = new Schema(required(1, "int", Types.IntegerType.get()), optional(2, "long", Types.LongType.get()));
    TypeDescription orcSchemaWithoutIds = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema));
    NameMapping nameMapping = MappingUtil.create(originalSchema);
    TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(originalSchema, ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping));
    Expression expr = and(equal("int", 1), equal("long", 1));
    Expression boundFilter = Binder.bind(originalSchema.asStruct(), expr, true);
    SearchArgument expected = SearchArgumentFactory.newBuilder().equals("`int`", Type.LONG, 1L).equals("`long`", Type.LONG, 1L).build();
    SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
    Assert.assertEquals(expected.toString(), actual.toString());
}

16 Source : TestSchemaAndMappingUpdate.java
with Apache License 2.0
from apache

@Test
public void testAddStructColumn() {
    NameMapping mapping = MappingUtil.create(table.schema());
    String mappingJson = NameMappingParser.toJson(mapping);
    table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit();
    table.updateSchema().addColumn("location", Types.StructType.of(Types.NestedField.optional(1, "lat", Types.DoubleType.get()), Types.NestedField.optional(2, "long", Types.DoubleType.get()))).commit();
    String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping updated = NameMappingParser.fromJson(updatedJson);
    validateUnchanged(mapping, updated);
    MappedField newMapping = updated.find("location");
    replacedert.replacedertNotNull("Mapping for new column should be added", newMapping);
    replacedert.replacedertEquals("Mapping should use the replacedigned field ID", (Integer) table.schema().findField("location").fieldId(), updated.find("location").id());
    replacedert.replacedertNotNull("Should contain a nested mapping", updated.find("location").nestedMapping());
    replacedert.replacedertEquals("Mapping should use the replacedigned field ID", (Integer) table.schema().findField("location.lat").fieldId(), updated.find("location.lat").id());
    replacedert.replacedertNull("Should not contain a nested mapping", updated.find("location.lat").nestedMapping());
    replacedert.replacedertEquals("Mapping should use the replacedigned field ID", (Integer) table.schema().findField("location.long").fieldId(), updated.find("location.long").id());
    replacedert.replacedertNull("Should not contain a nested mapping", updated.find("location.long").nestedMapping());
}

16 Source : TestAvroNameMapping.java
with Apache License 2.0
from apache

@Test
public void testMapProjections() throws IOException {
    Schema writeSchema = new Schema(Types.NestedField.required(0, "id", Types.LongType.get()), Types.NestedField.optional(5, "location", Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get()), Types.NestedField.optional(2, "long", Types.FloatType.get())))));
    Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table"));
    record.put("id", 34L);
    Record location = new Record(AvroSchemaUtil.fromOption(AvroSchemaUtil.fromOption(record.getSchema().getField("location").schema()).getValueType()));
    location.put("lat", 52.995143f);
    location.put("long", -1.539054f);
    record.put("location", ImmutableMap.of("l1", location));
    // Table mapping does not project `location` map
    NameMapping nameMapping = MappingUtil.create(new Schema(Types.NestedField.required(0, "id", Types.LongType.get())));
    Schema readSchema = writeSchema;
    Record projected = writeAndRead(writeSchema, readSchema, record, nameMapping);
    // field id 5 comes from read schema
    replacedert.replacedertNotNull("Field missing from table mapping is renamed", projected.getSchema().getField("location_r5"));
    replacedert.replacedertNull("location field should not be read", projected.get("location_r5"));
    replacedert.replacedertEquals(34L, projected.get("id"));
    // Table mapping partially project `location` map value
    nameMapping = MappingUtil.create(new Schema(Types.NestedField.required(0, "id", Types.LongType.get()), Types.NestedField.optional(5, "location", Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get()))))));
    projected = writeAndRead(writeSchema, readSchema, record, nameMapping);
    Record projectedL1 = ((Map<String, Record>) projected.get("location")).get("l1");
    replacedert.replacedertNotNull("Field missing from table mapping is renamed", projectedL1.getSchema().getField("long_r2"));
    replacedert.replacedertNull("location.value.long, should not be read", projectedL1.get("long_r2"));
}

16 Source : TestAvroNameMapping.java
with Apache License 2.0
from apache

@Test
public void testArrayProjections() throws Exception {
    Schema writeSchema = new Schema(Types.NestedField.required(0, "id", Types.LongType.get()), Types.NestedField.optional(22, "point", Types.ListType.ofOptional(21, Types.StructType.of(Types.NestedField.required(19, "x", Types.IntegerType.get()), Types.NestedField.optional(18, "y", Types.IntegerType.get())))));
    Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table"));
    record.put("id", 34L);
    Record pointRecord = new Record(AvroSchemaUtil.fromOption(AvroSchemaUtil.fromOption(record.getSchema().getField("point").schema()).getElementType()));
    pointRecord.put("x", 1);
    pointRecord.put("y", 2);
    record.put("point", ImmutableList.of(pointRecord));
    NameMapping nameMapping = MappingUtil.create(new Schema(// Optional array field missing.
    Types.NestedField.required(0, "id", Types.LongType.get())));
    Schema readSchema = writeSchema;
    Record projected = writeAndRead(writeSchema, readSchema, record, nameMapping);
    replacedert.replacedertNotNull("Field missing from table mapping is renamed", projected.getSchema().getField("point_r22"));
    replacedert.replacedertNull("point field is not projected", projected.get("point_r22"));
    replacedert.replacedertEquals(34L, projected.get("id"));
    // point array is partially projected
    nameMapping = MappingUtil.create(new Schema(Types.NestedField.required(0, "id", Types.LongType.get()), Types.NestedField.optional(22, "point", Types.ListType.ofOptional(21, Types.StructType.of(Types.NestedField.required(19, "x", Types.IntegerType.get()))))));
    projected = writeAndRead(writeSchema, readSchema, record, nameMapping);
    Record point = ((List<Record>) projected.get("point")).get(0);
    replacedert.replacedertNotNull("Field missing from table mapping is renamed", point.getSchema().getField("y_r18"));
    replacedert.replacedertEquals("point.x is projected", 1, point.get("x"));
    replacedert.replacedertNull("point.y is not projected", point.get("y_r18"));
    replacedert.replacedertEquals(34L, projected.get("id"));
}

15 Source : SparkTableUtil.java
with Apache License 2.0
from apache

private static List<DataFile> listOrcPartition(Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf, MetricsConfig metricsSpec, NameMapping mapping) {
    try {
        Path partition = new Path(partitionUri);
        FileSystem fs = partition.getFileSystem(conf);
        return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER)).filter(FileStatus::isFile).map(stat -> {
            Metrics metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath(), conf), metricsSpec, mapping);
            String partitionKey = spec.fields().stream().map(PartitionField::name).map(name -> String.format("%s=%s", name, partitionPath.get(name))).collect(Collectors.joining("/"));
            return DataFiles.builder(spec).withPath(stat.getPath().toString()).withFormat("orc").withFileSizeInBytes(stat.getLen()).withMetrics(metrics).withPartitionPath(partitionKey).build();
        }).collect(Collectors.toList());
    } catch (IOException e) {
        throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
    }
}

15 Source : TestExpressionToSearchArgument.java
with Apache License 2.0
from apache

@Test
public void testModifiedComplexSchemaNameMapping() {
    Schema originalSchema = new Schema(optional(1, "struct", Types.StructType.of(required(2, "long", Types.LongType.get()))), optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())), optional(5, "map", Types.MapType.ofRequired(6, 7, Types.LongType.get(), Types.LongType.get())), optional(8, "listOfStruct", Types.ListType.ofRequired(9, Types.StructType.of(required(10, "long", Types.LongType.get())))), optional(11, "listOfPeople", Types.ListType.ofRequired(12, Types.StructType.of(required(13, "name", Types.StringType.get()), required(14, "birth_date", Types.DateType.get())))));
    Schema mappingSchema = new Schema(optional(1, "struct", Types.StructType.of(required(2, "int", Types.LongType.get()))), optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())), optional(5, "newMap", Types.MapType.ofRequired(6, 7, Types.StringType.get(), Types.LongType.get())), optional(8, "listOfStruct", Types.ListType.ofRequired(9, Types.StructType.of(required(10, "newLong", Types.LongType.get())))), optional(11, "listOfPeople", Types.ListType.ofRequired(12, Types.StructType.of(required(13, "name", Types.StringType.get()), required(14, "age", Types.IntegerType.get())))));
    TypeDescription orcSchemaWithoutIds = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema));
    NameMapping nameMapping = MappingUtil.create(mappingSchema);
    TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(mappingSchema, ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping));
    Expression expr = and(and(equal("struct.int", 1), and(lessThanOrEqual("list.element", 5), equal("newMap.key", "country")), and(equal("listOfStruct.newLong", 100L), notEqual("listOfPeople.name", "Bob"))), lessThan("listOfPeople.age", 30));
    Expression boundFilter = Binder.bind(mappingSchema.asStruct(), expr, true);
    SearchArgument expected = SearchArgumentFactory.newBuilder().startAnd().equals("`struct`.`int_r2`", Type.LONG, 1L).lessThanEquals("`list`.`_elem`", Type.LONG, 5L).equals("`newMap_r5`.`_key`", Type.STRING, "country").equals("`listOfStruct`.`_elem`.`newLong_r10`", Type.LONG, 100L).startOr().isNull("`listOfPeople`.`_elem`.`name`", Type.STRING).startNot().equals("`listOfPeople`.`_elem`.`name`", Type.STRING, "Bob").end().end().lessThan("`listOfPeople`.`_elem`.`age_r14`", Type.LONG, 30L).end().build();
    SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
    Assert.assertEquals(expected.toString(), actual.toString());
}

15 Source : TestSchemaAndMappingUpdate.java
with Apache License 2.0
from apache

@Test
public void testAddPrimitiveColumn() {
    NameMapping mapping = MappingUtil.create(table.schema());
    String mappingJson = NameMappingParser.toJson(mapping);
    table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit();
    table.updateSchema().addColumn("count", Types.LongType.get()).commit();
    String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    NameMapping updated = NameMappingParser.fromJson(updatedJson);
    validateUnchanged(mapping, updated);
    MappedField newMapping = updated.find("count");
    replacedert.replacedertNotNull("Mapping for new column should be added", newMapping);
    replacedert.replacedertEquals("Mapping should use the replacedigned field ID", (Integer) table.schema().findField("count").fieldId(), updated.find("count").id());
    replacedert.replacedertNull("Should not contain a nested mapping", updated.find("count").nestedMapping());
}

15 Source : TestAvroNameMapping.java
with Apache License 2.0
from apache

@Test
public void testComplexMapKeys() throws IOException {
    Schema writeSchema = new Schema(Types.NestedField.required(5, "location", Types.MapType.ofRequired(6, 7, Types.StructType.of(Types.NestedField.required(3, "k1", Types.StringType.get()), Types.NestedField.required(4, "k2", Types.StringType.get())), Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get()), Types.NestedField.optional(2, "long", Types.FloatType.get())))));
    Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table"));
    org.apache.avro.Schema locationSchema = record.getSchema().getField("location").schema();
    Record locationElement = new Record(locationSchema.getElementType());
    Record locationKey = new Record(locationElement.getSchema().getField("key").schema());
    Record locationValue = new Record(locationElement.getSchema().getField("value").schema());
    locationKey.put("k1", "k1");
    locationKey.put("k2", "k2");
    locationValue.put("lat", 52.995143f);
    locationValue.put("long", -1.539054f);
    locationElement.put("key", locationKey);
    locationElement.put("value", locationValue);
    record.put("location", ImmutableList.of(locationElement));
    // project a subset of the map's value columns in NameMapping
    NameMapping nameMapping = MappingUtil.create(new Schema(Types.NestedField.required(5, "location", Types.MapType.ofOptional(6, 7, Types.StructType.of(Types.NestedField.required(3, "k1", Types.StringType.get()), Types.NestedField.optional(4, "k2", Types.StringType.get())), Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get()))))));
    Schema readSchema = new Schema(Types.NestedField.required(5, "location", Types.MapType.ofOptional(6, 7, Types.StructType.of(Types.NestedField.required(3, "k1", Types.StringType.get()), Types.NestedField.optional(4, "k2", Types.StringType.get())), Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get()), Types.NestedField.optional(2, "long", Types.FloatType.get())))));
    Record projected = writeAndRead(writeSchema, readSchema, record, nameMapping);
    // The data is read back as a map
    Map<Record, Record> projectedLocation = (Map<Record, Record>) projected.get("location");
    Record projectedKey = projectedLocation.keySet().iterator().next();
    Record projectedValue = projectedLocation.values().iterator().next();
    Assert.assertEquals(0, Comparators.charSequences().compare("k1", (CharSequence) projectedKey.get("k1")));
    Assert.assertEquals(0, Comparators.charSequences().compare("k2", (CharSequence) projectedKey.get("k2")));
    Assert.assertEquals(52.995143f, projectedValue.get("lat"));
    Assert.assertNotNull(projectedValue.getSchema().getField("long_r2"));
    Assert.assertNull(projectedValue.get("long_r2"));
}
