org.apache.iceberg.hadoop.HadoopTables

Here are examples of the Java API org.apache.iceberg.hadoop.HadoopTables, taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

96 Examples
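Before the voted examples, here is a minimal usage sketch (not taken from any of the projects below) showing the basic HadoopTables workflow: create an Iceberg table at a filesystem path, load it back, and drop it. The schema fields and the file:///tmp location are placeholders; the HadoopTables calls themselves (create, load, dropTable) are the same ones exercised in the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class HadoopTablesSketch {

    public static void main(String[] args) {
        // HadoopTables resolves tables by location instead of by catalog name
        HadoopTables tables = new HadoopTables(new Configuration());
        Schema schema = new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()));
        String location = "file:///tmp/iceberg/example_table";
        // create an unpartitioned table at the given path
        Table table = tables.create(schema, PartitionSpec.unpartitioned(), location);
        // load the table back from the same path
        Table loaded = tables.load(location);
        System.out.println(loaded.schema());
        // drop the table and its data
        tables.dropTable(location);
    }
}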

19 Source : SparkCatalog.java
with Apache License 2.0
from apache

/**
 * A Spark TableCatalog implementation that wraps an Iceberg {@link Catalog}.
 * <p>
 * This supports the following catalog configuration options:
 * <ul>
 *   <li><tt>type</tt> - catalog type, "hive" or "hadoop"</li>
 *   <li><tt>uri</tt> - the Hive Metastore URI (Hive catalog only)</li>
 *   <li><tt>warehouse</tt> - the warehouse path (Hadoop catalog only)</li>
 *   <li><tt>default-namespace</tt> - a namespace to use as the default</li>
 * </ul>
 * <p>
 * To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override
 * {@link #buildIcebergCatalog(String, CaseInsensitiveStringMap)}.
 */
public clreplaced SparkCatalog extends BaseCatalog {

    private static final Set<String> DEFAULT_NS_KEYS = ImmutableSet.of(TableCatalog.PROP_OWNER);

    private String catalogName = null;

    private Catalog icebergCatalog = null;

    private boolean cacheEnabled = true;

    private SupportsNamespaces asNamespaceCatalog = null;

    private String[] defaultNamespace = null;

    private HadoopTables tables;

    /**
     * Build an Iceberg {@link Catalog} to be used by this Spark catalog adapter.
     *
     * @param name Spark's catalog name
     * @param options Spark's catalog options
     * @return an Iceberg catalog
     */
    protected Catalog buildIcebergCatalog(String name, CaseInsensitiveStringMap options) {
        Configuration conf = SparkSession.active().sessionState().newHadoopConf();
        Map<String, String> optionsMap = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
        optionsMap.putAll(options);
        return CatalogUtil.buildIcebergCatalog(name, optionsMap, conf);
    }

    /**
     * Build an Iceberg {@link TableIdentifier} for the given Spark identifier.
     *
     * @param identifier Spark's identifier
     * @return an Iceberg identifier
     */
    protected TableIdentifier buildIdentifier(Identifier identifier) {
        return Spark3Util.identifierToTableIdentifier(identifier);
    }

    @Override
    public SparkTable loadTable(Identifier ident) throws NoSuchTableException {
        try {
            Table icebergTable = load(ident);
            return new SparkTable(icebergTable, !cacheEnabled);
        } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
            throw new NoSuchTableException(ident);
        }
    }

    @Override
    public SparkTable createTable(Identifier ident, StructType schema, Transform[] transforms, Map<String, String> properties) throws TableAlreadyExistsException {
        Schema icebergSchema = SparkSchemaUtil.convert(schema);
        try {
            Catalog.TableBuilder builder = newBuilder(ident, icebergSchema);
            Table icebergTable = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)).withLocation(properties.get("location")).withProperties(Spark3Util.rebuildCreateProperties(properties)).create();
            return new SparkTable(icebergTable, !cacheEnabled);
        } catch (AlreadyExistsException e) {
            throw new TableAlreadyExistsException(ident);
        }
    }

    @Override
    public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] transforms, Map<String, String> properties) throws TableAlreadyExistsException {
        Schema icebergSchema = SparkSchemaUtil.convert(schema);
        try {
            Catalog.TableBuilder builder = newBuilder(ident, icebergSchema);
            Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)).withLocation(properties.get("location")).withProperties(Spark3Util.rebuildCreateProperties(properties)).createTransaction();
            return new StagedSparkTable(transaction);
        } catch (AlreadyExistsException e) {
            throw new TableAlreadyExistsException(ident);
        }
    }

    @Override
    public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] transforms, Map<String, String> properties) throws NoSuchTableException {
        Schema icebergSchema = SparkSchemaUtil.convert(schema);
        try {
            Catalog.TableBuilder builder = newBuilder(ident, icebergSchema);
            Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)).withLocation(properties.get("location")).withProperties(Spark3Util.rebuildCreateProperties(properties)).replaceTransaction();
            return new StagedSparkTable(transaction);
        } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
            throw new NoSuchTableException(ident);
        }
    }

    @Override
    public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] transforms, Map<String, String> properties) {
        Schema icebergSchema = SparkSchemaUtil.convert(schema);
        Catalog.TableBuilder builder = newBuilder(ident, icebergSchema);
        Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)).withLocation(properties.get("location")).withProperties(Spark3Util.rebuildCreateProperties(properties)).createOrReplaceTransaction();
        return new StagedSparkTable(transaction);
    }

    @Override
    public SparkTable alterTable(Identifier ident, TableChange... changes) throws NoSuchTableException {
        SetProperty setLocation = null;
        SetProperty setSnapshotId = null;
        SetProperty pickSnapshotId = null;
        List<TableChange> propertyChanges = Lists.newArrayList();
        List<TableChange> schemaChanges = Lists.newArrayList();
        for (TableChange change : changes) {
            if (change instanceof SetProperty) {
                SetProperty set = (SetProperty) change;
                if (TableCatalog.PROP_LOCATION.equalsIgnoreCase(set.property())) {
                    setLocation = set;
                } else if ("current-snapshot-id".equalsIgnoreCase(set.property())) {
                    setSnapshotId = set;
                } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.property())) {
                    pickSnapshotId = set;
                } else {
                    propertyChanges.add(set);
                }
            } else if (change instanceof RemoveProperty) {
                propertyChanges.add(change);
            } else if (change instanceof ColumnChange) {
                schemaChanges.add(change);
            } else {
                throw new UnsupportedOperationException("Cannot apply unknown table change: " + change);
            }
        }
        try {
            Table table = load(ident);
            commitChanges(table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges);
        } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
            throw new NoSuchTableException(ident);
        }
        return null;
    }

    @Override
    public boolean dropTable(Identifier ident) {
        try {
            return isPathIdentifier(ident) ? tables.dropTable(((PathIdentifier) ident).location()) : icebergCatalog.dropTable(buildIdentifier(ident));
        } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
            return false;
        }
    }

    @Override
    public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException {
        try {
            checkNotPathIdentifier(from, "renameTable");
            checkNotPathIdentifier(to, "renameTable");
            icebergCatalog.renameTable(buildIdentifier(from), buildIdentifier(to));
        } catch (org.apache.iceberg.exceptions.NoSuchTableException e) {
            throw new NoSuchTableException(from);
        } catch (AlreadyExistsException e) {
            throw new TableAlreadyExistsException(to);
        }
    }

    @Override
    public void invalidateTable(Identifier ident) {
        try {
            load(ident).refresh();
        } catch (org.apache.iceberg.exceptions.NoSuchTableException ignored) {
        // ignore if the table doesn't exist, it is not cached
        }
    }

    @Override
    public Identifier[] listTables(String[] namespace) {
        return icebergCatalog.listTables(Namespace.of(namespace)).stream().map(ident -> Identifier.of(ident.namespace().levels(), ident.name())).toArray(Identifier[]::new);
    }

    @Override
    public String[] defaultNamespace() {
        if (defaultNamespace != null) {
            return defaultNamespace;
        }
        return new String[0];
    }

    @Override
    public String[][] listNamespaces() {
        if (asNamespaceCatalog != null) {
            return asNamespaceCatalog.listNamespaces().stream().map(Namespace::levels).toArray(String[][]::new);
        }
        return new String[0][];
    }

    @Override
    public String[][] listNamespaces(String[] namespace) throws NoSuchNamespaceException {
        if (asNamespaceCatalog != null) {
            try {
                return asNamespaceCatalog.listNamespaces(Namespace.of(namespace)).stream().map(Namespace::levels).toArray(String[][]::new);
            } catch (org.apache.iceberg.exceptions.NoSuchNamespaceException e) {
                throw new NoSuchNamespaceException(namespace);
            }
        }
        throw new NoSuchNamespaceException(namespace);
    }

    @Override
    public Map<String, String> loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException {
        if (asNamespaceCatalog != null) {
            try {
                return asNamespaceCatalog.loadNamespaceMetadata(Namespace.of(namespace));
            } catch (org.apache.iceberg.exceptions.NoSuchNamespaceException e) {
                throw new NoSuchNamespaceException(namespace);
            }
        }
        throw new NoSuchNamespaceException(namespace);
    }

    @Override
    public void createNamespace(String[] namespace, Map<String, String> metadata) throws NamespaceAlreadyExistsException {
        if (asNamespaceCatalog != null) {
            try {
                if (asNamespaceCatalog instanceof HadoopCatalog && DEFAULT_NS_KEYS.equals(metadata.keySet())) {
                    // Hadoop catalog will reject metadata properties, but Spark automatically adds "owner".
                    // If only the automatic properties are present, replace metadata with an empty map.
                    asNamespaceCatalog.createNamespace(Namespace.of(namespace), ImmutableMap.of());
                } else {
                    asNamespaceCatalog.createNamespace(Namespace.of(namespace), metadata);
                }
            } catch (AlreadyExistsException e) {
                throw new NamespaceAlreadyExistsException(namespace);
            }
        } else {
            throw new UnsupportedOperationException("Namespaces are not supported by catalog: " + catalogName);
        }
    }

    @Override
    public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException {
        if (asNamespaceCatalog != null) {
            Map<String, String> updates = Maps.newHashMap();
            Set<String> removals = Sets.newHashSet();
            for (NamespaceChange change : changes) {
                if (change instanceof NamespaceChange.SetProperty) {
                    NamespaceChange.SetProperty set = (NamespaceChange.SetProperty) change;
                    updates.put(set.property(), set.value());
                } else if (change instanceof NamespaceChange.RemoveProperty) {
                    removals.add(((NamespaceChange.RemoveProperty) change).property());
                } else {
                    throw new UnsupportedOperationException("Cannot apply unknown namespace change: " + change);
                }
            }
            try {
                if (!updates.isEmpty()) {
                    asNamespaceCatalog.setProperties(Namespace.of(namespace), updates);
                }
                if (!removals.isEmpty()) {
                    asNamespaceCatalog.removeProperties(Namespace.of(namespace), removals);
                }
            } catch (org.apache.iceberg.exceptions.NoSuchNamespaceException e) {
                throw new NoSuchNamespaceException(namespace);
            }
        } else {
            throw new NoSuchNamespaceException(namespace);
        }
    }

    @Override
    public boolean dropNamespace(String[] namespace) throws NoSuchNamespaceException {
        if (asNamespaceCatalog != null) {
            try {
                return asNamespaceCatalog.dropNamespace(Namespace.of(namespace));
            } catch (org.apache.iceberg.exceptions.NoSuchNamespaceException e) {
                throw new NoSuchNamespaceException(namespace);
            }
        }
        return false;
    }

    @Override
    public final void initialize(String name, CaseInsensitiveStringMap options) {
        this.cacheEnabled = Boolean.parseBoolean(options.getOrDefault("cache-enabled", "true"));
        Catalog catalog = buildIcebergCatalog(name, options);
        this.catalogName = name;
        this.tables = new HadoopTables(SparkSession.active().sessionState().newHadoopConf());
        this.icebergCatalog = cacheEnabled ? CachingCatalog.wrap(catalog) : catalog;
        if (catalog instanceof SupportsNamespaces) {
            this.asNamespaceCatalog = (SupportsNamespaces) catalog;
            if (options.containsKey("default-namespace")) {
                this.defaultNamespace = Splitter.on('.').splitToList(options.get("default-namespace")).toArray(new String[0]);
            }
        }
    }

    @Override
    public String name() {
        return catalogName;
    }

    private static void commitChanges(Table table, SetProperty setLocation, SetProperty setSnapshotId, SetProperty pickSnapshotId, List<TableChange> propertyChanges, List<TableChange> schemaChanges) {
        // don't allow setting the snapshot and picking a commit at the same time because order is ambiguous and choosing
        // one order leads to different results
        Preconditions.checkArgument(setSnapshotId == null || pickSnapshotId == null, "Cannot set the current snapshot ID and cherry-pick snapshot changes");
        if (setSnapshotId != null) {
            long newSnapshotId = Long.parseLong(setSnapshotId.value());
            table.manageSnapshots().setCurrentSnapshot(newSnapshotId).commit();
        }
        // if updating the table snapshot, perform that update first in case it fails
        if (pickSnapshotId != null) {
            long newSnapshotId = Long.parseLong(pickSnapshotId.value());
            table.manageSnapshots().cherrypick(newSnapshotId).commit();
        }
        Transaction transaction = table.newTransaction();
        if (setLocation != null) {
            transaction.updateLocation().setLocation(setLocation.value()).commit();
        }
        if (!propertyChanges.isEmpty()) {
            Spark3Util.applyPropertyChanges(transaction.updateProperties(), propertyChanges).commit();
        }
        if (!schemaChanges.isEmpty()) {
            Spark3Util.applySchemaChanges(transaction.updateSchema(), schemaChanges).commit();
        }
        transaction.commitTransaction();
    }

    private static boolean isPathIdentifier(Identifier ident) {
        return ident instanceof PathIdentifier;
    }

    private static void checkNotPathIdentifier(Identifier identifier, String method) {
        if (identifier instanceof PathIdentifier) {
            throw new IllegalArgumentException(String.format("Cannot pass path based identifier to %s method. %s is a path.", method, identifier));
        }
    }

    private Table load(Identifier ident) {
        return isPathIdentifier(ident) ? tables.load(((PathIdentifier) ident).location()) : icebergCatalog.loadTable(buildIdentifier(ident));
    }

    private Catalog.TableBuilder newBuilder(Identifier ident, Schema schema) {
        return isPathIdentifier(ident) ? tables.buildTable(((PathIdentifier) ident).location(), schema) : icebergCatalog.buildTable(buildIdentifier(ident), schema);
    }
}
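For context on the options listed in the javadoc above, the following sketch (an illustration under stated assumptions, not part of the example itself) shows how this SparkCatalog might be registered as a Spark 3 catalog plugin backed by a Hadoop warehouse. The catalog name "demo", the namespace, and the warehouse path are hypothetical placeholders; the option keys (type, warehouse, default-namespace) mirror the javadoc.

import org.apache.spark.sql.SparkSession;

public class SparkCatalogConfigSketch {

    public static void main(String[] args) {
        // register an Iceberg SparkCatalog named "demo"; the spark.sql.catalog.<name>.*
        // keys map to the catalog options documented in the javadoc above
        SparkSession spark = SparkSession.builder()
                .master("local[2]")
                .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
                .config("spark.sql.catalog.demo.type", "hadoop")
                .config("spark.sql.catalog.demo.warehouse", "file:///tmp/iceberg-warehouse")
                .config("spark.sql.catalog.demo.default-namespace", "db")
                .getOrCreate();

        // tables are then addressed through the configured catalog
        spark.sql("CREATE TABLE demo.db.events (id BIGINT, data STRING) USING iceberg");
        spark.sql("SELECT * FROM demo.db.events").show();
    }
}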

19 Source : TestSparkTableUtilWithInMemoryCatalog.java
with Apache License 2.0
from apache

public class TestSparkTableUtilWithInMemoryCatalog {

    private static final HadoopTables TABLES = new HadoopTables(new Configuration());

    private static final Schema SCHEMA = new Schema(optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get()));

    private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("data").build();

    private static SparkSession spark;

    @BeforeClass
    public static void startSpark() {
        TestSparkTableUtilWithInMemoryCatalog.spark = SparkSession.builder().master("local[2]").getOrCreate();
    }

    @AfterClass
    public static void stopSpark() {
        SparkSession currentSpark = TestSparkTableUtilWithInMemoryCatalog.spark;
        TestSparkTableUtilWithInMemoryCatalog.spark = null;
        currentSpark.stop();
    }

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private String tableLocation = null;

    @Before
    public void setupTableLocation() throws Exception {
        File tableDir = temp.newFolder();
        this.tableLocation = tableDir.toURI().toString();
    }

    @Test
    public void testImportUnpartitionedTable() throws IOException {
        Map<String, String> props = Maps.newHashMap();
        props.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
        props.put(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "data", "full");
        Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), props, tableLocation);
        List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
        File parquetTableDir = temp.newFolder("parquet_table");
        String parquetTableLocation = parquetTableDir.toURI().toString();
        try {
            Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class).coalesce(1);
            inputDF.select("id", "data").write().format("parquet").mode("append").option("path", parquetTableLocation).saveAsTable("parquet_table");
            File stagingDir = temp.newFolder("staging-dir");
            SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
            List<SimpleRecord> actualRecords = spark.read().format("iceberg").load(tableLocation).orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
            Assert.assertEquals("Result rows should match", records, actualRecords);
            Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");
            Types.NestedField idField = table.schema().findField("id");
            checkFieldMetrics(fileDF, idField, true);
            Types.NestedField dataField = table.schema().findField("data");
            checkFieldMetrics(fileDF, dataField, false);
        } finally {
            spark.sql("DROP TABLE parquet_table");
        }
    }

    @Test
    public void testImportPartitionedTable() throws IOException {
        Map<String, String> props = Maps.newHashMap();
        props.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
        props.put(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "data", "full");
        Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);
        List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
        File parquetTableDir = temp.newFolder("parquet_table");
        String parquetTableLocation = parquetTableDir.toURI().toString();
        try {
            Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
            inputDF.select("id", "data").write().format("parquet").mode("append").option("path", parquetTableLocation).partitionBy("data").saveAsTable("parquet_table");
            Assert.assertEquals("Should have 3 partitions", 3, SparkTableUtil.getPartitions(spark, "parquet_table").size());
            Assert.assertEquals("Should have 1 partition where data = 'a'", 1, SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'").size());
            File stagingDir = temp.newFolder("staging-dir");
            SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
            List<SimpleRecord> actualRecords = spark.read().format("iceberg").load(tableLocation).orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
            Assert.assertEquals("Result rows should match", records, actualRecords);
            Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");
            Types.NestedField idField = table.schema().findField("id");
            checkFieldMetrics(fileDF, idField, true);
            // 'data' is a partition column and is not physically present in files written by Spark
            Types.NestedField dataField = table.schema().findField("data");
            checkFieldMetrics(fileDF, dataField, true);
        } finally {
            spark.sql("DROP TABLE parquet_table");
        }
    }

    @Test
    public void testImportPartitions() throws IOException {
        Table table = TABLES.create(SCHEMA, SPEC, tableLocation);
        List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
        File parquetTableDir = temp.newFolder("parquet_table");
        String parquetTableLocation = parquetTableDir.toURI().toString();
        try {
            Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
            inputDF.select("id", "data").write().format("parquet").mode("append").option("path", parquetTableLocation).partitionBy("data").saveAsTable("parquet_table");
            File stagingDir = temp.newFolder("staging-dir");
            List<SparkPartition> partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
            SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());
            List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
            List<SimpleRecord> actualRecords = spark.read().format("iceberg").load(tableLocation).orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
            Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
        } finally {
            spark.sql("DROP TABLE parquet_table");
        }
    }

    @Test
    public void testImportPartitionsWithSnapshotInheritance() throws IOException {
        Table table = TABLES.create(SCHEMA, SPEC, tableLocation);
        table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit();
        List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
        File parquetTableDir = temp.newFolder("parquet_table");
        String parquetTableLocation = parquetTableDir.toURI().toString();
        try {
            Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
            inputDF.select("id", "data").write().format("parquet").mode("append").option("path", parquetTableLocation).partitionBy("data").saveAsTable("parquet_table");
            File stagingDir = temp.newFolder("staging-dir");
            List<SparkPartition> partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
            SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());
            List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
            List<SimpleRecord> actualRecords = spark.read().format("iceberg").load(tableLocation).orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
            Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
        } finally {
            spark.sql("DROP TABLE parquet_table");
        }
    }

    @Test
    public void testImportTableWithMappingForNestedData() throws IOException {
        File parquetTableDir = temp.newFolder("parquet_table");
        String parquetTableLocation = parquetTableDir.toURI().toString();
        try {
            Dataset<Row> df1 = spark.range(1, 2).withColumn("extra_col", functions.lit(-1)).withColumn("struct", functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')"));
            Dataset<Row> df2 = spark.range(2, 3).withColumn("extra_col", functions.lit(-1)).withColumn("struct", functions.expr("named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')"));
            df1.union(df2).coalesce(1).select("id", "extra_col", "struct").write().format("parquet").mode("append").option("path", parquetTableLocation).saveAsTable("parquet_table");
            // don't include `extra_col` and `nested_2` on purpose
            Schema schema = new Schema(optional(1, "id", Types.LongType.get()), required(2, "struct", Types.StructType.of(required(3, "nested_1", Types.StringType.get()), required(4, "nested_3", Types.StringType.get()))));
            Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), tableLocation);
            // assign a custom metrics config and a name mapping
            NameMapping nameMapping = MappingUtil.create(schema);
            table.updateProperties().set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts").set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full").set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full").set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)).commit();
            File stagingDir = temp.newFolder("staging-dir");
            SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
            // validate we get the expected results back
            List<Row> expected = spark.table("parquet_table").select("id", "struct.nested_1", "struct.nested_3").collectAsList();
            List<Row> actual = spark.read().format("iceberg").load(tableLocation).select("id", "struct.nested_1", "struct.nested_3").collectAsList();
            replacedert.replacedertEquals("Rows must match", expected, actual);
            // validate we persisted correct metrics
            Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");
            List<Row> bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList();
            replacedert.replacedertEquals("Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size());
            replacedert.replacedertEquals("Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size());
            Types.NestedField nestedField1 = table.schema().findField("struct.nested_1");
            checkFieldMetrics(fileDF, nestedField1, true);
            Types.NestedField id = table.schema().findField("id");
            checkFieldMetrics(fileDF, id, 1L, 2L);
            Types.NestedField nestedField3 = table.schema().findField("struct.nested_3");
            checkFieldMetrics(fileDF, nestedField3, "f", "g");
        } finally {
            spark.sql("DROP TABLE parquet_table");
        }
    }

    @Test
    public void testImportTableWithMappingForNestedDataPartitionedTable() throws IOException {
        File parquetTableDir = temp.newFolder("parquet_table");
        String parquetTableLocation = parquetTableDir.toURI().toString();
        try {
            Dataset<Row> df1 = spark.range(1, 2).withColumn("extra_col", functions.lit(-1)).withColumn("struct", functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')")).withColumn("data", functions.lit("Z"));
            Dataset<Row> df2 = spark.range(2, 3).withColumn("extra_col", functions.lit(-1)).withColumn("struct", functions.expr("named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')")).withColumn("data", functions.lit("Z"));
            df1.union(df2).coalesce(1).select("id", "extra_col", "struct", "data").write().format("parquet").mode("append").option("path", parquetTableLocation).partitionBy("data").saveAsTable("parquet_table");
            // don't include `extra_col` and `nested_2` on purpose
            Schema schema = new Schema(optional(1, "id", Types.LongType.get()), required(2, "struct", Types.StructType.of(required(4, "nested_1", Types.StringType.get()), required(5, "nested_3", Types.StringType.get()))), required(3, "data", Types.StringType.get()));
            PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build();
            Table table = TABLES.create(schema, spec, tableLocation);
            // assign a custom metrics config and a name mapping
            NameMapping nameMapping = MappingUtil.create(schema);
            table.updateProperties().set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts").set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full").set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full").set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)).commit();
            File stagingDir = temp.newFolder("staging-dir");
            SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
            // validate we get the expected results back
            List<Row> expected = spark.table("parquet_table").select("id", "struct.nested_1", "struct.nested_3", "data").collectAsList();
            List<Row> actual = spark.read().format("iceberg").load(tableLocation).select("id", "struct.nested_1", "struct.nested_3", "data").collectAsList();
            replacedert.replacedertEquals("Rows must match", expected, actual);
            // validate we persisted correct metrics
            Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");
            List<Row> bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList();
            replacedert.replacedertEquals("Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size());
            replacedert.replacedertEquals("Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size());
            Types.NestedField nestedField1 = table.schema().findField("struct.nested_1");
            checkFieldMetrics(fileDF, nestedField1, true);
            Types.NestedField id = table.schema().findField("id");
            checkFieldMetrics(fileDF, id, 1L, 2L);
            Types.NestedField nestedField3 = table.schema().findField("struct.nested_3");
            checkFieldMetrics(fileDF, nestedField3, "f", "g");
        } finally {
            spark.sql("DROP TABLE parquet_table");
        }
    }

    @Test
    public void testImportTableWithInt96Timestamp() throws IOException {
        File parquetTableDir = temp.newFolder("parquet_table");
        String parquetTableLocation = parquetTableDir.toURI().toString();
        try {
            spark.conf().set(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE().key(), "INT96");
            Column timestampColumn = functions.to_timestamp(functions.lit("2010-03-20 10:40:30.1234"));
            Dataset<Row> df = spark.range(1, 10).withColumn("tmp_col", timestampColumn);
            df.coalesce(1).select("id", "tmp_col").write().format("parquet").mode("append").option("path", parquetTableLocation).saveAsTable("parquet_table");
            Schema schema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "tmp_col", Types.TimestampType.withZone()));
            Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), tableLocation);
            // assign a custom metrics config
            table.updateProperties().set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full").commit();
            File stagingDir = temp.newFolder("staging-dir");
            SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
            // validate we get the expected results back
            List<Row> expected = spark.table("parquet_table").select("id", "tmp_col").collectAsList();
            List<Row> actual = spark.read().format("iceberg").load(tableLocation).select("id", "tmp_col").collectAsList();
            replacedert.replacedertEquals("Rows must match", expected, actual);
            // validate we did not persist metrics for INT96
            Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");
            Types.NestedField timestampField = table.schema().findField("tmp_col");
            checkFieldMetrics(fileDF, timestampField, true);
            Types.NestedField idField = table.schema().findField("id");
            checkFieldMetrics(fileDF, idField, 1L, 9L);
        } finally {
            spark.sql("DROP TABLE parquet_table");
        }
    }

    private void checkFieldMetrics(Dataset<Row> fileDF, Types.NestedField field, Object min, Object max) {
        List<Row> metricRows = fileDF.selectExpr(String.format("lower_bounds['%d']", field.fieldId()), String.format("upper_bounds['%d']", field.fieldId())).collectAsList();
        // we compare string representations not to deal with HeapCharBuffers for strings
        Object actualMin = Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(metricRows.get(0).getAs(0)));
        replacedert.replacedertEquals("Min value should match", min.toString(), actualMin.toString());
        Object actualMax = Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(metricRows.get(0).getAs(1)));
        replacedert.replacedertEquals("Max value should match", max.toString(), actualMax.toString());
    }

    private void checkFieldMetrics(Dataset<Row> fileDF, Types.NestedField field, boolean isNull) {
        List<Row> metricRows = fileDF.selectExpr(String.format("lower_bounds['%d']", field.fieldId()), String.format("upper_bounds['%d']", field.fieldId())).collectAsList();
        metricRows.forEach(row -> {
            replacedert.replacedertEquals("Invalid metrics for column: " + field.name(), isNull, row.isNullAt(0));
            replacedert.replacedertEquals("Invalid metrics for column: " + field.name(), isNull, row.isNullAt(1));
        });
    }
}

19 Source : TestCatalog.java
with Apache License 2.0
from apache

public class TestCatalog implements Catalog, Configurable {

    private HadoopTables tables;

    private Configuration conf;

    private String warehouse;

    public TestCatalog() {
    }

    @Override
    public String name() {
        return "test-tables";
    }

    private String tablePath(TableIdentifier identifier) {
        return String.format("%s/%s", warehouse, identifier.name());
    }

    @Override
    public List<TableIdentifier> listTables(Namespace namespace) {
        throw new UnsupportedOperationException();
    }

    @Override
    public Table createTable(TableIdentifier identifier, Schema schema, PartitionSpec spec, String location, Map<String, String> properties) {
        return tables.create(schema, spec, properties, tablePath(identifier));
    }

    @Override
    public Transaction newCreateTableTransaction(TableIdentifier identifier, Schema schema, PartitionSpec spec, String location, Map<String, String> properties) {
        throw new UnsupportedOperationException();
    }

    @Override
    public Transaction newReplaceTableTransaction(TableIdentifier identifier, Schema schema, PartitionSpec spec, String location, Map<String, String> properties, boolean orCreate) {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean dropTable(TableIdentifier identifier, boolean purge) {
        return tables.dropTable(tablePath(identifier), purge);
    }

    @Override
    public void renameTable(TableIdentifier from, TableIdentifier to) {
        throw new UnsupportedOperationException();
    }

    @Override
    public Table loadTable(TableIdentifier identifier) {
        return tables.load(tablePath(identifier));
    }

    @Override
    public void initialize(String name, Map<String, String> properties) {
        String uri = properties.get(CatalogProperties.URI);
        warehouse = properties.get("warehouse");
        Preconditions.checkArgument(uri != null, "A uri parameter must be set");
        Preconditions.checkArgument(uri.contains("thrift"), "A uri parameter must be valid");
        Preconditions.checkArgument(warehouse != null, "A warehouse parameter must be set");
        this.tables = new HadoopTables(conf);
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }
}

19 Source : ConcurrencyTest.java
with Apache License 2.0
from apache

@Before
public void before() throws IOException {
    tableLocation = Files.createTempDirectory("temp").toFile();
    spark = SparkSession.builder().master("local[2]").getOrCreate();
    spark.sparkContext().setLogLevel("WARN");
    HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
    table = tables.create(schema, tableLocation.toString());
    for (int i = 0; i < 1000000; i++) {
        data.add(new SimpleRecord(1, "bdp"));
    }
    log.info("End of setup phase");
}

19 Source : VectorizedReadFlatParquetDataBenchmark.java
with Apache License 2.0
from apache

@Override
protected Table initTable() {
    Schema schema = new Schema(optional(1, "longCol", Types.LongType.get()), optional(2, "intCol", Types.IntegerType.get()), optional(3, "floatCol", Types.FloatType.get()), optional(4, "doubleCol", Types.DoubleType.get()), optional(5, "decimalCol", Types.DecimalType.of(20, 5)), optional(6, "dateCol", Types.DateType.get()), optional(7, "timestampCol", Types.TimestampType.withZone()), optional(8, "stringCol", Types.StringType.get()));
    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables(hadoopConf());
    Map<String, String> properties = parquetWriteProps();
    return tables.create(schema, partitionSpec, properties, newTableLocation());
}

19 Source : TestScanTaskSerialization.java
with Apache License 2.0
from apache

public abstract class TestScanTaskSerialization extends SparkTestBase {

    private static final HadoopTables TABLES = new HadoopTables(new Configuration());

    private static final Schema SCHEMA = new Schema(optional(1, "c1", Types.IntegerType.get()), optional(2, "c2", Types.StringType.get()), optional(3, "c3", Types.StringType.get()));

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private String tableLocation = null;

    @Before
    public void setupTableLocation() throws Exception {
        File tableDir = temp.newFolder();
        this.tableLocation = tableDir.toURI().toString();
    }

    @Test
    public void testBaseCombinedScanTaskKryoSerialization() throws Exception {
        BaseCombinedScanTask scanTask = prepareBaseCombinedScanTaskForSerDeTest();
        File data = temp.newFile();
        Assert.assertTrue(data.delete());
        Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();
        try (Output out = new Output(new FileOutputStream(data))) {
            kryo.writeClassAndObject(out, scanTask);
        }
        try (Input in = new Input(new FileInputStream(data))) {
            Object obj = kryo.readClassAndObject(in);
            Assert.assertTrue("Should be a BaseCombinedScanTask", obj instanceof BaseCombinedScanTask);
            TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj);
        }
    }

    @Test
    public void testBaseCombinedScanTaskJavaSerialization() throws Exception {
        BaseCombinedScanTask scanTask = prepareBaseCombinedScanTaskForSerDeTest();
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
            out.writeObject(scanTask);
        }
        try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
            Object obj = in.readObject();
            replacedert.replacedertTrue("Should be a BaseCombinedScanTask", obj instanceof BaseCombinedScanTask);
            TaskCheckHelper.replacedertEquals(scanTask, (BaseCombinedScanTask) obj);
        }
    }

    private BaseCombinedScanTask prepareBaseCombinedScanTaskForSerDeTest() {
        PartitionSpec spec = PartitionSpec.unpartitioned();
        Map<String, String> options = Maps.newHashMap();
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
        writeRecords(records1);
        List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
        writeRecords(records2);
        table.refresh();
        CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
        return new BaseCombinedScanTask(Lists.newArrayList(tasks));
    }

    private void writeRecords(List<ThreeColumnRecord> records) {
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        writeDF(df);
    }

    private void writeDF(Dataset<Row> df) {
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    }
}

19 Source : TestSparkDataFile.java
with Apache License 2.0
from apache

public abstract class TestSparkDataFile {

    private static final HadoopTables TABLES = new HadoopTables(new Configuration());

    private static final Schema SCHEMA = new Schema(required(100, "id", Types.LongType.get()), optional(101, "data", Types.StringType.get()), required(102, "b", Types.BooleanType.get()), optional(103, "i", Types.IntegerType.get()), required(104, "l", Types.LongType.get()), optional(105, "f", Types.FloatType.get()), required(106, "d", Types.DoubleType.get()), optional(107, "date", Types.DateType.get()), required(108, "ts", Types.TimestampType.withZone()), required(110, "s", Types.StringType.get()), optional(113, "bytes", Types.BinaryType.get()), required(114, "dec_9_0", Types.DecimalType.of(9, 0)), required(115, "dec_11_2", Types.DecimalType.of(11, 2)), // maximum precision
    required(116, "dec_38_10", Types.DecimalType.of(38, 10)));

    private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("b").bucket("i", 2).identity("l").identity("f").identity("d").identity("date").hour("ts").identity("ts").truncate("s", 2).identity("bytes").bucket("dec_9_0", 2).bucket("dec_11_2", 2).bucket("dec_38_10", 2).build();

    private static SparkSession spark;

    private static JavaSparkContext sparkContext = null;

    @BeforeClass
    public static void startSpark() {
        TestSparkDataFile.spark = SparkSession.builder().master("local[2]").getOrCreate();
        TestSparkDataFile.sparkContext = new JavaSparkContext(spark.sparkContext());
    }

    @AfterClass
    public static void stopSpark() {
        SparkSession currentSpark = TestSparkDataFile.spark;
        TestSparkDataFile.spark = null;
        TestSparkDataFile.sparkContext = null;
        currentSpark.stop();
    }

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private String tableLocation = null;

    @Before
    public void setupTableLocation() throws Exception {
        File tableDir = temp.newFolder();
        this.tableLocation = tableDir.toURI().toString();
    }

    @Test
    public void testValueConversion() throws IOException {
        Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation);
        checkSparkDataFile(table);
    }

    @Test
    public void testValueConversionPartitionedTable() throws IOException {
        Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);
        checkSparkDataFile(table);
    }

    @Test
    public void testValueConversionWithEmptyStats() throws IOException {
        Map<String, String> props = Maps.newHashMap();
        props.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
        Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);
        checkSparkDataFile(table);
    }

    private void checkSparkDataFile(Table table) throws IOException {
        Iterable<InternalRow> rows = RandomData.generateSpark(table.schema(), 200, 0);
        JavaRDD<InternalRow> rdd = sparkContext.parallelize(Lists.newArrayList(rows));
        Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false);
        df.write().format("iceberg").mode("append").save(tableLocation);
        table.refresh();
        List<ManifestFile> manifests = table.currentSnapshot().allManifests();
        replacedert.replacedertEquals("Should have 1 manifest", 1, manifests.size());
        List<DataFile> dataFiles = Lists.newArrayList();
        try (ManifestReader<DataFile> reader = ManifestFiles.read(manifests.get(0), table.io())) {
            reader.forEach(dataFile -> dataFiles.add(dataFile.copy()));
        }
        Dataset<Row> dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files");
        // reorder columns to test arbitrary projections
        List<Column> columns = Arrays.stream(dataFileDF.columns()).map(ColumnName::new).collect(Collectors.toList());
        Collections.shuffle(columns);
        List<Row> sparkDataFiles = dataFileDF.select(Iterables.toArray(columns, Column.class)).collectAsList();
        Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size());
        Types.StructType dataFileType = DataFile.getType(table.spec().partitionType());
        StructType sparkDataFileType = sparkDataFiles.get(0).schema();
        SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkDataFileType);
        for (int i = 0; i < dataFiles.size(); i++) {
            checkDataFile(dataFiles.get(i), wrapper.wrap(sparkDataFiles.get(i)));
        }
    }

    private void checkDataFile(DataFile expected, DataFile actual) {
        replacedert.replacedertEquals("Path must match", expected.path(), actual.path());
        replacedert.replacedertEquals("Format must match", expected.format(), actual.format());
        replacedert.replacedertEquals("Record count must match", expected.recordCount(), actual.recordCount());
        replacedert.replacedertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes());
        replacedert.replacedertEquals("Record value counts must match", expected.valueCounts(), actual.valueCounts());
        replacedert.replacedertEquals("Record null value counts must match", expected.nullValueCounts(), actual.nullValueCounts());
        replacedert.replacedertEquals("Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts());
        replacedert.replacedertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds());
        replacedert.replacedertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds());
        replacedert.replacedertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata());
        replacedert.replacedertEquals("Split offsets must match", expected.splitOffsets(), actual.splitOffsets());
        replacedert.replacedertEquals("Sort order id must match", expected.sortOrderId(), actual.sortOrderId());
        checkStructLike(expected.parreplacedion(), actual.parreplacedion());
    }

    private void checkStructLike(StructLike expected, StructLike actual) {
        replacedert.replacedertEquals("Struct size should match", expected.size(), actual.size());
        for (int i = 0; i < expected.size(); i++) {
            replacedert.replacedertEquals("Struct values must match", expected.get(i, Object.clreplaced), actual.get(i, Object.clreplaced));
        }
    }
}

19 Source : TestIcebergSourceHadoopTables.java
with Apache License 2.0
from apache

public abstract class TestIcebergSourceHadoopTables extends TestIcebergSourceTablesBase {

    private static final HadoopTables TABLES = new HadoopTables(new Configuration());

    File tableDir = null;

    String tableLocation = null;

    @Before
    public void setupTable() throws Exception {
        this.tableDir = temp.newFolder();
        // created by table create
        tableDir.delete();
        this.tableLocation = tableDir.toURI().toString();
    }

    @Override
    public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec) {
        if (spec.equals(PartitionSpec.unpartitioned())) {
            return TABLES.create(schema, tableLocation);
        }
        return TABLES.create(schema, spec, tableLocation);
    }

    @Override
    public Table loadTable(TableIdentifier ident, String entriesSuffix) {
        return TABLES.load(loadLocation(ident, entriesSuffix));
    }

    @Override
    public String loadLocation(TableIdentifier ident, String entriesSuffix) {
        return String.format("%s#%s", loadLocation(ident), entriesSuffix);
    }

    @Override
    public String loadLocation(TableIdentifier ident) {
        return tableLocation;
    }
}

19 Source : TestRewriteDataFilesAction.java
with Apache License 2.0
from apache

public abstract class TestRewriteDataFilesAction extends SparkTestBase {

    private static final HadoopTables TABLES = new HadoopTables(new Configuration());

    private static final Schema SCHEMA = new Schema(optional(1, "c1", Types.IntegerType.get()), optional(2, "c2", Types.StringType.get()), optional(3, "c3", Types.StringType.get()));

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private String tableLocation = null;

    @Before
    public void setupTableLocation() throws Exception {
        File tableDir = temp.newFolder();
        this.tableLocation = tableDir.toURI().toString();
    }

    @Test
    public void testRewriteDataFilesEmptyTable() {
        PartitionSpec spec = PartitionSpec.unpartitioned();
        Map<String, String> options = Maps.newHashMap();
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        replacedert.replacedertNull("Table must be empty", table.currentSnapshot());
        Actions actions = Actions.forTable(table);
        actions.rewriteDataFiles().execute();
        replacedert.replacedertNull("Table must stay empty", table.currentSnapshot());
    }

    @Test
    public void testRewriteDataFilesUnpartitionedTable() {
        PartitionSpec spec = PartitionSpec.unpartitioned();
        Map<String, String> options = Maps.newHashMap();
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
        writeRecords(records1);
        List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
        writeRecords(records2);
        table.refresh();
        CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
        List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
        replacedert.replacedertEquals("Should have 4 data files before rewrite", 4, dataFiles.size());
        Actions actions = Actions.forTable(table);
        RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
        replacedert.replacedertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size());
        replacedert.replacedertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
        table.refresh();
        CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
        List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
        replacedert.replacedertEquals("Should have 1 data files before rewrite", 1, dataFiles1.size());
        List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
        expectedRecords.addAll(records1);
        expectedRecords.addAll(records2);
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
    }

    @Test
    public void testRewriteDataFilesPartitionedTable() {
        PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
        Map<String, String> options = Maps.newHashMap();
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
        writeRecords(records1);
        List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
        writeRecords(records2);
        List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
        writeRecords(records3);
        List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
        writeRecords(records4);
        table.refresh();
        CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
        List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
        replacedert.replacedertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
        Actions actions = Actions.forTable(table);
        RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
        replacedert.replacedertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
        replacedert.replacedertEquals("Action should add 4 data file", 4, result.addedDataFiles().size());
        table.refresh();
        CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
        List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
        replacedert.replacedertEquals("Should have 4 data files before rewrite", 4, dataFiles1.size());
        List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
        expectedRecords.addAll(records1);
        expectedRecords.addAll(records2);
        expectedRecords.addAll(records3);
        expectedRecords.addAll(records4);
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
    }

    @Test
    public void testRewriteDataFilesWithFilter() {
        PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
        Map<String, String> options = Maps.newHashMap();
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
        writeRecords(records1);
        List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
        writeRecords(records2);
        List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
        writeRecords(records3);
        List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
        writeRecords(records4);
        table.refresh();
        CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
        List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
        replacedert.replacedertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
        Actions actions = Actions.forTable(table);
        RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("c1", 1)).filter(Expressions.startsWith("c2", "AA")).execute();
        replacedert.replacedertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
        replacedert.replacedertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
        table.refresh();
        CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
        List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
        replacedert.replacedertEquals("Should have 7 data files before rewrite", 7, dataFiles1.size());
        List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
        expectedRecords.addAll(records1);
        expectedRecords.addAll(records2);
        expectedRecords.addAll(records3);
        expectedRecords.addAll(records4);
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
    }

    @Test
    public void testRewriteLargeTableHasResiduals() {
        PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).build();
        Map<String, String> options = Maps.newHashMap();
        options.put(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100");
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        // all records belong to the same partition
        List<ThreeColumnRecord> records = Lists.newArrayList();
        for (int i = 0; i < 100; i++) {
            records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i % 4)));
        }
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        writeDF(df);
        table.refresh();
        CloseableIterable<FileScanTask> tasks = table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles();
        for (FileScanTask task : tasks) {
            replacedert.replacedertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual());
        }
        List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
        replacedert.replacedertEquals("Should have 2 data files before rewrite", 2, dataFiles.size());
        Actions actions = Actions.forTable(table);
        RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("c3", "0")).execute();
        replacedert.replacedertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
        replacedert.replacedertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
        table.refresh();
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.sort("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", records, actualRecords);
    }

    @Test
    public void testRewriteDataFilesForLargeFile() throws AnalysisException {
        PartitionSpec spec = PartitionSpec.unpartitioned();
        Map<String, String> options = Maps.newHashMap();
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        Assert.assertNull("Table must be empty", table.currentSnapshot());
        List<ThreeColumnRecord> records1 = Lists.newArrayList();
        IntStream.range(0, 2000).forEach(i -> records1.add(new ThreeColumnRecord(i, "foo" + i, "bar" + i)));
        Dataset<Row> df = spark.createDataFrame(records1, ThreeColumnRecord.class).repartition(1);
        writeDF(df);
        List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "DDDDDDDDDD", "DDDD"));
        writeRecords(records2);
        table.refresh();
        CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
        List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
        DataFile maxSizeFile = Collections.max(dataFiles, Comparator.comparingLong(DataFile::fileSizeInBytes));
        replacedert.replacedertEquals("Should have 3 files before rewrite", 3, dataFiles.size());
        spark.read().format("iceberg").load(tableLocation).createTempView("origin");
        long originalNumRecords = spark.read().format("iceberg").load(tableLocation).count();
        List<Object[]> originalRecords = sql("SELECT * from origin sort by c2");
        Actions actions = Actions.forTable(table);
        long targetSizeInBytes = maxSizeFile.fileSizeInBytes() - 10;
        RewriteDataFilesActionResult result = actions.rewriteDataFiles().targetSizeInBytes(targetSizeInBytes).splitOpenFileCost(1).execute();
        replacedert.replacedertEquals("Action should delete 4 data files", 4, result.deletedDataFiles().size());
        replacedert.replacedertEquals("Action should add 2 data files", 2, result.addedDataFiles().size());
        spark.read().format("iceberg").load(tableLocation).createTempView("postRewrite");
        long postRewriteNumRecords = spark.read().format("iceberg").load(tableLocation).count();
        List<Object[]> rewrittenRecords = sql("SELECT * from postRewrite sort by c2");
        Assert.assertEquals(originalNumRecords, postRewriteNumRecords);
        assertEquals("Rows should be unchanged", originalRecords, rewrittenRecords);
    }

    private void writeRecords(List<ThreeColumnRecord> records) {
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        writeDF(df);
    }

    private void writeDF(Dataset<Row> df) {
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    }
}
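
The rewrite tests above all follow the same pattern: create a table through HadoopTables, append a few small files with Spark, then compact them with the rewriteDataFiles action. A minimal sketch of that flow, not taken from the Iceberg sources (the helper name, table location, and target file size are placeholders):

// Compact the small data files of an existing path-based Iceberg table.
static void compactSmallFiles(SparkSession spark, String tableLocation) {
    HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
    Table table = tables.load(tableLocation);
    RewriteDataFilesActionResult result = Actions.forTable(table)
        .rewriteDataFiles()
        .targetSizeInBytes(512L * 1024 * 1024)  // aim for roughly 512 MB output files
        .execute();
    System.out.printf("Rewrote %d files into %d files%n",
        result.deletedDataFiles().size(), result.addedDataFiles().size());
}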

19 Source : TestRemoveOrphanFilesAction.java
with Apache License 2.0
from apache

public abstract class TestRemoveOrphanFilesAction extends SparkTestBase {

    private static final HadoopTables TABLES = new HadoopTables(new Configuration());

    protected static final Schema SCHEMA = new Schema(optional(1, "c1", Types.IntegerType.get()), optional(2, "c2", Types.StringType.get()), optional(3, "c3", Types.StringType.get()));

    protected static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).truncate("c2", 2).identity("c3").build();

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private File tableDir = null;

    protected String tableLocation = null;

    @Before
    public void setupTableLocation() throws Exception {
        this.tableDir = temp.newFolder();
        this.tableLocation = tableDir.toURI().toString();
    }

    @Test
    public void testDryRun() throws IOException, InterruptedException {
        Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation);
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        List<String> validFiles = spark.read().format("iceberg").load(tableLocation + "#files").select("file_path").as(Encoders.STRING()).collectAsList();
        replacedert.replacedertEquals("Should be 2 valid files", 2, validFiles.size());
        df.write().mode("append").parquet(tableLocation + "/data");
        Path dataPath = new Path(tableLocation + "/data");
        FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf());
        List<String> allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())).filter(FileStatus::isFile).map(file -> file.getPath().toString()).collect(Collectors.toList());
        replacedert.replacedertEquals("Should be 3 files", 3, allFiles.size());
        List<String> invalidFiles = Lists.newArrayList(allFiles);
        invalidFiles.removeAll(validFiles);
        replacedert.replacedertEquals("Should be 1 invalid file", 1, invalidFiles.size());
        // sleep for 1 second to unsure files will be old enough
        Thread.sleep(1000);
        Actions actions = Actions.forTable(table);
        List<String> result1 = actions.removeOrphanFiles().deleteWith(s -> {
        }).execute();
        replacedert.replacedertTrue("Default olderThan interval should be safe", result1.isEmpty());
        List<String> result2 = actions.removeOrphanFiles().olderThan(System.currentTimeMillis()).deleteWith(s -> {
        }).execute();
        replacedert.replacedertEquals("Action should find 1 file", invalidFiles, result2);
        replacedert.replacedertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0))));
        List<String> result3 = actions.removeOrphanFiles().olderThan(System.currentTimeMillis()).execute();
        replacedert.replacedertEquals("Action should delete 1 file", invalidFiles, result3);
        replacedert.replacedertFalse("Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0))));
        List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
        expectedRecords.addAll(records);
        expectedRecords.addAll(records);
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
    }

    @Test
    public void testAllValidFilesAreKept() throws IOException, InterruptedException {
        Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);
        List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1);
        // original append
        df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1);
        // dynamic partition overwrite
        df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation);
        // second append
        df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        List<Snapshot> snapshots = Lists.newArrayList(table.snapshots());
        List<String> snapshotFiles1 = snapshotFiles(snapshots.get(0).snapshotId());
        Assert.assertEquals(1, snapshotFiles1.size());
        List<String> snapshotFiles2 = snapshotFiles(snapshots.get(1).snapshotId());
        Assert.assertEquals(1, snapshotFiles2.size());
        List<String> snapshotFiles3 = snapshotFiles(snapshots.get(2).snapshotId());
        Assert.assertEquals(2, snapshotFiles3.size());
        df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data");
        df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA");
        df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
        df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/invalid/invalid");
        // sleep for 1 second to ensure files will be old enough
        Thread.sleep(1000);
        Actions actions = Actions.forTable(table);
        List<String> result = actions.removeOrphanFiles().olderThan(System.currentTimeMillis()).execute();
        replacedert.replacedertEquals("Should delete 4 files", 4, result.size());
        Path dataPath = new Path(tableLocation + "/data");
        FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf());
        for (String fileLocation : snapshotFiles1) {
            replacedert.replacedertTrue("All snapshot files must remain", fs.exists(new Path(fileLocation)));
        }
        for (String fileLocation : snapshotFiles2) {
            replacedert.replacedertTrue("All snapshot files must remain", fs.exists(new Path(fileLocation)));
        }
        for (String fileLocation : snapshotFiles3) {
            replacedert.replacedertTrue("All snapshot files must remain", fs.exists(new Path(fileLocation)));
        }
    }

    @Test
    public void testWapFilesAreKept() throws InterruptedException {
        Map<String, String> props = Maps.newHashMap();
        props.put(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true");
        Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        // normal write
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        spark.conf().set("spark.wap.id", "1");
        // wap write
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Should not return data from the staged snapshot", records, actualRecords);
        // sleep for 1 second to ensure files will be old enough
        Thread.sleep(1000);
        Actions actions = Actions.forTable(table);
        List<String> result = actions.removeOrphanFiles().olderThan(System.currentTimeMillis()).execute();
        replacedert.replacedertTrue("Should not delete any files", result.isEmpty());
    }

    @Test
    public void testMetadataFolderIsIntact() throws InterruptedException {
        // write data directly to the table location
        Map<String, String> props = Maps.newHashMap();
        props.put(TableProperties.WRITE_NEW_DATA_LOCATION, tableLocation);
        Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA");
        // sleep for 1 second to ensure files will be old enough
        Thread.sleep(1000);
        Actions actions = Actions.forTable(table);
        List<String> result = actions.removeOrphanFiles().olderThan(System.currentTimeMillis()).execute();
        replacedert.replacedertEquals("Should delete 1 file", 1, result.size());
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", records, actualRecords);
    }

    @Test
    public void testOlderThanTimestamp() throws InterruptedException {
        Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
        df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
        Thread.sleep(1000);
        long timestamp = System.currentTimeMillis();
        Thread.sleep(1000);
        df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
        Actions actions = Actions.forTable(table);
        List<String> result = actions.removeOrphanFiles().olderThan(timestamp).execute();
        replacedert.replacedertEquals("Should delete only 2 files", 2, result.size());
    }

    @Test
    public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedException {
        Map<String, String> props = Maps.newHashMap();
        props.put(TableProperties.WRITE_NEW_DATA_LOCATION, tableLocation);
        props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1");
        Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        // sleep for 1 second to ensure files will be old enough
        Thread.sleep(1000);
        Actions actions = Actions.forTable(table);
        List<String> result = actions.removeOrphanFiles().olderThan(System.currentTimeMillis()).execute();
        replacedert.replacedertEquals("Should delete 1 file", 1, result.size());
        replacedert.replacedertTrue("Should remove v1 file", result.get(0).contains("v1.metadata.json"));
        List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
        expectedRecords.addAll(records);
        expectedRecords.addAll(records);
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
    }

    @Test
    public void testManyTopLevelPartitions() throws InterruptedException {
        Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);
        List<ThreeColumnRecord> records = Lists.newArrayList();
        for (int i = 0; i < 100; i++) {
            records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i)));
        }
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        // sleep for 1 second to ensure files will be old enough
        Thread.sleep(1000);
        Actions actions = Actions.forTable(table);
        List<String> result = actions.removeOrphanFiles().olderThan(System.currentTimeMillis()).execute();
        replacedert.replacedertTrue("Should not delete any files", result.isEmpty());
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", records, actualRecords);
    }

    @Test
    public void testManyLeafPartitions() throws InterruptedException {
        Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);
        List<ThreeColumnRecord> records = Lists.newArrayList();
        for (int i = 0; i < 100; i++) {
            records.add(new ThreeColumnRecord(i, String.valueOf(i % 3), String.valueOf(i)));
        }
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        // sleep for 1 second to ensure files will be old enough
        Thread.sleep(1000);
        Actions actions = Actions.forTable(table);
        List<String> result = actions.removeOrphanFiles().olderThan(System.currentTimeMillis()).execute();
        replacedert.replacedertTrue("Should not delete any files", result.isEmpty());
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", records, actualRecords);
    }

    private List<String> snapshotFiles(long snapshotId) {
        return spark.read().format("iceberg").option("snapshot-id", snapshotId).load(tableLocation + "#files").select("file_path").as(Encoders.STRING()).collectAsList();
    }

    @Test
    public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, InterruptedException {
        Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath());
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableDir.getAbsolutePath());
        List<String> validFiles = spark.read().format("iceberg").load(tableLocation + "#files").select("file_path").as(Encoders.STRING()).collectAsList();
        replacedert.replacedertEquals("Should be 1 valid files", 1, validFiles.size());
        String validFile = validFiles.get(0);
        df.write().mode("append").parquet(tableLocation + "/data");
        Path dataPath = new Path(tableLocation + "/data");
        FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf());
        List<String> allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())).filter(FileStatus::isFile).map(file -> file.getPath().toString()).collect(Collectors.toList());
        replacedert.replacedertEquals("Should be 2 files", 2, allFiles.size());
        List<String> invalidFiles = Lists.newArrayList(allFiles);
        invalidFiles.removeIf(file -> file.contains(validFile));
        replacedert.replacedertEquals("Should be 1 invalid file", 1, invalidFiles.size());
        // sleep for 1 second to unsure files will be old enough
        Thread.sleep(1000);
        Actions actions = Actions.forTable(table);
        List<String> result = actions.removeOrphanFiles().olderThan(System.currentTimeMillis()).deleteWith(s -> {
        }).execute();
        replacedert.replacedertEquals("Action should find 1 file", invalidFiles, result);
        replacedert.replacedertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0))));
    }

    @Test
    public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException {
        HadoopCatalog catalog = new HadoopCatalog(new Configuration(), tableLocation);
        String namespaceName = "testDb";
        String tableName = "testTb";
        Namespace namespace = Namespace.of(namespaceName);
        TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName);
        Table table = catalog.createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap());
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(table.location());
        df.write().mode("append").parquet(table.location() + "/data");
        // sleep for 1 second to ensure files will be old enough
        Thread.sleep(1000);
        table.refresh();
        List<String> result = Actions.forTable(table).removeOrphanFiles().olderThan(System.currentTimeMillis()).execute();
        replacedert.replacedertEquals("Should delete only 1 files", 1, result.size());
        Dataset<Row> resultDF = spark.read().format("iceberg").load(table.location());
        List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", records, actualRecords);
    }

    @Test
    public void testHiveCatalogTable() throws IOException {
        Table table = catalog.createTable(TableIdentifier.of("default", "hivetestorphan"), SCHEMA, SPEC, tableLocation, Maps.newHashMap());
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save("default.hivetestorphan");
        String location = table.location().replaceFirst("file:", "");
        new File(location + "/data/trashfile").createNewFile();
        List<String> results = Actions.forTable(table).removeOrphanFiles().olderThan(System.currentTimeMillis() + 1000).execute();
        replacedert.replacedertTrue("trash file should be removed", results.contains("file:" + location + "data/trashfile"));
    }

    @Test
    public void testGarbageCollectionDisabled() {
        Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation);
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
        table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit();
        Actions actions = Actions.forTable(table);
        replacedertHelpers.replacedertThrows("Should complain about removing orphan files", ValidationException.clreplaced, "Cannot remove orphan files: GC is disabled", actions::removeOrphanFiles);
    }
}
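
Outside of a test, the same orphan-file cleanup can be pointed at any path-based table; a minimal sketch, not taken from the sources above (the helper name, location, and three-day retention window are placeholder choices):

// Delete files under the table location that no snapshot references and that are older than 3 days.
static List<String> deleteOrphanFiles(String tableLocation) {
    HadoopTables tables = new HadoopTables(new Configuration());
    Table table = tables.load(tableLocation);
    long olderThan = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3);
    return Actions.forTable(table)
        .removeOrphanFiles()
        .olderThan(olderThan)
        .execute();  // returns the locations of the deleted files
}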

19 Source : TestHiveIcebergOutputCommitter.java
with Apache License 2.0
from apache

private Table table(String location, boolean partitioned) {
    HadoopTables tables = new HadoopTables();
    return tables.create(CUSTOMER_SCHEMA, partitioned ? PARTITIONED_SPEC : PartitionSpec.unpartitioned(), location);
}
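
For reference, the basic HadoopTables lifecycle used throughout these examples is create, load, and dropTable, each keyed by a filesystem path rather than a catalog name. A minimal sketch with a placeholder path and schema (not part of the snippet above):

// Create, reload, and drop a path-based Iceberg table.
HadoopTables tables = new HadoopTables(new Configuration());
Schema schema = new Schema(
    optional(1, "id", Types.IntegerType.get()),
    optional(2, "data", Types.StringType.get()));
String location = "file:/tmp/hadoop_tables_demo";  // placeholder location

Table created = tables.create(schema, PartitionSpec.unpartitioned(), location);
Table loaded = tables.load(location);  // resolves the table from its metadata files
tables.dropTable(location);            // removes metadata and data files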

18 Source : TestSelect.java
with Apache License 2.0
from apache

public class TestSelect {

    private static final HadoopTables TABLES = new HadoopTables(new Configuration());

    private static final Schema SCHEMA = new Schema(optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get()), optional(3, "doubleVal", Types.DoubleType.get()));

    private static SparkSession spark;

    private static int scanEventCount = 0;

    private static ScanEvent lastScanEvent = null;

    private Table table;

    static {
        Listeners.register(event -> {
            scanEventCount += 1;
            lastScanEvent = event;
        }, ScanEvent.class);
    }

    @BeforeClass
    public static void startSpark() {
        spark = SparkSession.builder().master("local[2]").getOrCreate();
    }

    @AfterClass
    public static void stopSpark() {
        SparkSession currentSpark = spark;
        spark = null;
        currentSpark.stop();
    }

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private String tableLocation = null;

    @Before
    public void init() throws Exception {
        File tableDir = temp.newFolder();
        this.tableLocation = tableDir.toURI().toString();
        table = TABLES.create(SCHEMA, tableLocation);
        List<Record> rows = Lists.newArrayList(new Record(1, "a", 1.0), new Record(2, "b", 2.0), new Record(3, "c", Double.NaN));
        Dataset<Row> df = spark.createDataFrame(rows, Record.class);
        df.select("id", "data", "doubleVal").write().format("iceberg").mode("append").save(tableLocation);
        table.refresh();
        Dataset<Row> results = spark.read().format("iceberg").load(tableLocation);
        results.createOrReplaceTempView("table");
        scanEventCount = 0;
        lastScanEvent = null;
    }

    @Test
    public void testSelect() {
        List<Record> expected = ImmutableList.of(new Record(1, "a", 1.0), new Record(2, "b", 2.0), new Record(3, "c", Double.NaN));
        replacedert.replacedertEquals("Should return all expected rows", expected, sql("select * from table", Encoders.bean(Record.clreplaced)));
    }

    @Test
    public void testSelectRewrite() {
        List<Record> expected = ImmutableList.of(new Record(3, "c", Double.NaN));
        replacedert.replacedertEquals("Should return all expected rows", expected, sql("SELECT * FROM table where doubleVal = double('NaN')", Encoders.bean(Record.clreplaced)));
        replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
        Expression filter = lastScanEvent.filter();
        replacedert.replacedertEquals("Should create AND expression", Expression.Operation.AND, filter.op());
        Expression left = ((And) filter).left();
        Expression right = ((And) filter).right();
        replacedert.replacedertEquals("Left expression should be NOT_NULL", Expression.Operation.NOT_NULL, left.op());
        replacedert.replacedertTrue("Left expression should contain column name 'doubleVal'", left.toString().contains("doubleVal"));
        replacedert.replacedertEquals("Right expression should be IS_NAN", Expression.Operation.IS_NAN, right.op());
        replacedert.replacedertTrue("Right expression should contain column name 'doubleVal'", right.toString().contains("doubleVal"));
    }

    @Test
    public void testProjection() {
        List<Integer> expected = ImmutableList.of(1, 2, 3);
        replacedert.replacedertEquals("Should return all expected rows", expected, sql("SELECT id FROM table", Encoders.INT()));
        replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
        replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
        replacedert.replacedertEquals("Should project only the id column", table.schema().select("id").replacedtruct(), lastScanEvent.projection().replacedtruct());
    }

    @Test
    public void testExpressionPushdown() {
        List<String> expected = ImmutableList.of("b");
        replacedert.replacedertEquals("Should return all expected rows", expected, sql("SELECT data FROM table WHERE id = 2", Encoders.STRING()));
        replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
        replacedert.replacedertEquals("Should project only id and data columns", table.schema().select("id", "data").replacedtruct(), lastScanEvent.projection().replacedtruct());
    }

    private <T> List<T> sql(String str, Encoder<T> encoder) {
        return spark.sql(str).as(encoder).collectAsList();
    }

    public static class Record implements Serializable {

        private Integer id;

        private String data;

        private Double doubleVal;

        public Record() {
        }

        Record(Integer id, String data, Double doubleVal) {
            this.id = id;
            this.data = data;
            this.doubleVal = doubleVal;
        }

        public void setId(Integer id) {
            this.id = id;
        }

        public void setData(String data) {
            this.data = data;
        }

        public void setDoubleVal(Double doubleVal) {
            this.doubleVal = doubleVal;
        }

        public Integer getId() {
            return id;
        }

        public String getData() {
            return data;
        }

        public Double getDoubleVal() {
            return doubleVal;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            Record record = (Record) o;
            return Objects.equal(id, record.id) && Objects.equal(data, record.data) && Objects.equal(doubleVal, record.doubleVal);
        }

        @Override
        public int hashCode() {
            return Objects.hashCode(id, data, doubleVal);
        }
    }
}

18 Source : TestCustomCatalog.java
with Apache License 2.0
from apache

public class TestCustomCatalog {

    private static final String CATALOG_IMPL = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, CustomCatalogs.ICEBERG_DEFAULT_CATALOG, CatalogProperties.CATALOG_IMPL);

    private static final String WAREHOUSE = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, CustomCatalogs.ICEBERG_DEFAULT_CATALOG, CatalogProperties.WAREHOUSE_LOCATION);

    private static final String URI_KEY = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, CustomCatalogs.ICEBERG_DEFAULT_CATALOG, CatalogProperties.URI);

    private static final String TEST_CATALOG = "placeholder_catalog";

    private static final String TEST_CATALOG_IMPL = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG, CatalogProperties.CATALOG_IMPL);

    private static final String TEST_WAREHOUSE = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG, CatalogProperties.WAREHOUSE_LOCATION);

    private static final String TEST_URI_KEY = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG, CatalogProperties.URI);

    // dummy uri
    private static final String URI_VAL = "thrift://localhost:12345";

    private static final String CATALOG_VAL = "org.apache.iceberg.spark.source.TestCatalog";

    private static final TableIdentifier TABLE = TableIdentifier.of("default", "table");

    private static final Schema SCHEMA = new Schema(optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get()));

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    File tableDir = null;

    String tableLocation = null;

    HadoopTables tables;

    protected static SparkSession spark = null;

    @BeforeClass
    public static void startMetastoreAndSpark() {
        spark = SparkSession.builder().master("local[2]").getOrCreate();
    }

    @AfterClass
    public static void stopMetastoreAndSpark() {
        spark.stop();
        spark = null;
    }

    @Before
    public void setupTable() throws Exception {
        SparkConf sparkConf = spark.sparkContext().conf();
        sparkConf.set(String.format("%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, CustomCatalogs.ICEBERG_DEFAULT_CATALOG), "placeholder");
        sparkConf.set(String.format("%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG), "placeholder");
        this.tables = new HadoopTables(spark.sessionState().newHadoopConf());
        this.tableDir = temp.newFolder();
        // created by table create
        tableDir.delete();
        this.tableLocation = tableDir.toURI().toString();
        tables.create(SCHEMA, PartitionSpec.unpartitioned(), String.format("%s/%s", tableLocation, TABLE.name()));
    }

    @After
    public void removeTable() {
        SparkConf sparkConf = spark.sparkContext().conf();
        sparkConf.remove(CATALOG_IMPL);
        sparkConf.remove(WAREHOUSE);
        sparkConf.remove(URI_KEY);
        tables.dropTable(String.format("%s/%s", tableLocation, TABLE.name()));
        tableDir.delete();
        CustomCatalogs.clearCache();
    }

    @Test
    public void withSparkOptions() {
        SparkConf sparkConf = spark.sparkContext().conf();
        sparkConf.set(CATALOG_IMPL, CATALOG_VAL);
        sparkConf.set(URI_KEY, URI_VAL);
        List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
        Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
        AssertHelpers.assertThrows("We have not set all properties", IllegalArgumentException.class, "A warehouse parameter must be set", () -> df.select("id", "data").write().format("iceberg").mode("append").save(TABLE.toString()));
        sparkConf.set(WAREHOUSE, tableLocation);
        df.select("id", "data").write().format("iceberg").mode("append").save(TABLE.toString());
        List<SimpleRecord> dfNew = spark.read().format("iceberg").load(TABLE.toString()).orderBy("id").as(Encoders.bean(SimpleRecord.clreplaced)).collectAsList();
        replacedert.replacedertEquals("Data should match", expected, dfNew);
    }

    @Test
    public void withSparkCatalog() {
        String catalogTable = String.format("%s.%s", TEST_CATALOG, TABLE.toString());
        SparkConf sparkConf = spark.sparkContext().conf();
        sparkConf.set(TEST_CATALOG_IMPL, CATALOG_VAL);
        sparkConf.set(TEST_URI_KEY, URI_VAL);
        List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
        Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
        AssertHelpers.assertThrows("We have not set all properties", IllegalArgumentException.class, "A warehouse parameter must be set", () -> df.select("id", "data").write().format("iceberg").mode("append").save(catalogTable));
        sparkConf.set(TEST_WAREHOUSE, tableLocation);
        df.select("id", "data").write().format("iceberg").mode("append").save(catalogTable);
        List<SimpleRecord> dfNew = spark.read().format("iceberg").load(catalogTable).orderBy("id").as(Encoders.bean(SimpleRecord.clreplaced)).collectAsList();
        replacedert.replacedertEquals("Data should match", expected, dfNew);
    }
}

18 Source : ReadAndWriteTablesTest.java
with Apache License 2.0
from apache

/**
 * This test class uses Spark to create partitioned and unpartitioned tables locally.
 */
public class ReadAndWriteTablesTest {

    private SparkSession spark;

    private Table table;

    private HadoopTables tables;

    private File pathToTable;

    private Schema schema;

    @Before
    public void before() throws IOException {
        spark = SparkSession.builder().master("local[2]").getOrCreate();
        pathToTable = Files.createTempDirectory("temp").toFile();
        tables = new HadoopTables(spark.sessionState().newHadoopConf());
        schema = new Schema(optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get()));
    }

    @Test
    public void createUnpartitionedTable() {
        table = tables.create(schema, pathToTable.toString());
        List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
        Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
        df.select("id", "data").write().format("iceberg").mode("append").save(pathToTable.toString());
        table.refresh();
    }

    @Test
    public void createPartitionedTable() {
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("id").build();
        table = tables.create(schema, spec, pathToTable.toString());
        List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
        Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
        df.select("id", "data").write().format("iceberg").mode("append").save(pathToTable.toString());
        table.refresh();
    }

    @Test
    public void writeDataFromJsonFile() {
        Schema bookSchema = new Schema(optional(1, "replacedle", Types.StringType.get()), optional(2, "price", Types.LongType.get()), optional(3, "author", Types.StringType.get()), optional(4, "published", Types.TimestampType.withZone()), optional(5, "genre", Types.StringType.get()));
        table = tables.create(bookSchema, pathToTable.toString());
        Dataset<Row> df = spark.read().json("src/test/resources/data/books.json");
        df.select(df.col("replacedle"), df.col("price"), df.col("author"), df.col("published").cast(DataTypes.TimestampType), df.col("genre")).write().format("iceberg").mode("append").save(pathToTable.toString());
        table.refresh();
    }

    @Test
    public void readFromIcebergTableWithSpark() {
        table = tables.create(schema, pathToTable.toString());
        Dataset<Row> results = spark.read().format("iceberg").load(pathToTable.toString());
        results.createOrReplaceTempView("table");
        spark.sql("select * from table").show();
    }

    @Test
    public void readFromPartitionedTableWithFilter() {
        table = tables.create(schema, pathToTable.toString());
        Dataset<Row> results = spark.read().format("iceberg").load(pathToTable.toString()).filter("data != \"b\"");
        results.createOrReplaceTempView("table");
        spark.sql("SELECT * FROM table").show();
    }

    @After
    public void after() throws IOException {
        FileUtils.deleteDirectory(pathToTable);
        spark.stop();
    }
}

18 Source : IcebergSourceFlatORCDataBenchmark.java
with Apache License 2.0
from apache

@Override
protected final Table initTable() {
    Schema schema = new Schema(required(1, "longCol", Types.LongType.get()), required(2, "intCol", Types.IntegerType.get()), required(3, "floatCol", Types.FloatType.get()), optional(4, "doubleCol", Types.DoubleType.get()), optional(5, "decimalCol", Types.DecimalType.of(20, 5)), optional(6, "dateCol", Types.DateType.get()), // Disable timestamp column for ORC performance tests as Spark native reader does not support ORC's
    // TIMESTAMP_INSTANT type
    // optional(7, "timestampCol", Types.TimestampType.withZone()),
    optional(8, "stringCol", Types.StringType.get()));
    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables(hadoopConf());
    Map<String, String> properties = Maps.newHashMap();
    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
    return tables.create(schema, partitionSpec, properties, newTableLocation());
}

18 Source : IcebergSourceNestedListDataBenchmark.java
with Apache License 2.0
from apache

@Override
protected final Table initTable() {
    Schema schema = new Schema(required(0, "id", Types.LongType.get()), optional(1, "outerlist", Types.ListType.ofOptional(2, Types.StructType.of(required(3, "innerlist", Types.ListType.ofRequired(4, Types.StringType.get()))))));
    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables(hadoopConf());
    Map<String, String> properties = Maps.newHashMap();
    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
    return tables.create(schema, partitionSpec, properties, newTableLocation());
}

18 Source : IcebergSourceNestedDataBenchmark.java
with Apache License 2.0
from apache

@Override
protected final Table initTable() {
    Schema schema = new Schema(required(0, "id", Types.LongType.get()), optional(4, "nested", Types.StructType.of(required(1, "col1", Types.StringType.get()), required(2, "col2", Types.DoubleType.get()), required(3, "col3", Types.LongType.get()))));
    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables(hadoopConf());
    Map<String, String> properties = Maps.newHashMap();
    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
    return tables.create(schema, partitionSpec, properties, newTableLocation());
}

18 Source : IcebergSourceFlatDataBenchmark.java
with Apache License 2.0
from apache

@Override
protected final Table initTable() {
    Schema schema = new Schema(required(1, "longCol", Types.LongType.get()), required(2, "intCol", Types.IntegerType.get()), required(3, "floatCol", Types.FloatType.get()), optional(4, "doubleCol", Types.DoubleType.get()), optional(5, "decimalCol", Types.DecimalType.of(20, 5)), optional(6, "dateCol", Types.DateType.get()), optional(7, "timestampCol", Types.TimestampType.withZone()), optional(8, "stringCol", Types.StringType.get()));
    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables(hadoopConf());
    Map<String, String> properties = Maps.newHashMap();
    properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
    return tables.create(schema, partitionSpec, properties, newTableLocation());
}
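
The benchmark classes above differ only in their schemas; each builds its table the same way, with an unpartitioned spec and gzip-compressed metadata. A condensed sketch of that shared setup (the schema and location below are placeholders, not the benchmark's own values):

// Shared table-setup pattern used by the source benchmarks above.
HadoopTables tables = new HadoopTables(new Configuration());
Map<String, String> properties = Maps.newHashMap();
properties.put(TableProperties.METADATA_COMPRESSION, "gzip");  // compress metadata JSON files
Schema schema = new Schema(
    required(1, "longCol", Types.LongType.get()),
    optional(2, "stringCol", Types.StringType.get()));
Table table = tables.create(schema, PartitionSpec.unpartitioned(), properties, "/tmp/benchmark_table");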

18 Source : TestDataSourceOptions.java
with Apache License 2.0
from apache

@Test
public void testSplitOptionsOverridesTableProperties() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    // 128Mb
    options.put(TableProperties.SPLIT_SIZE, String.valueOf(128L * 1024 * 1024));
    Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"));
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").repartition(1).write().format("iceberg").mode("append").save(tableLocation);
    List<DataFile> files = Lists.newArrayList(icebergTable.currentSnapshot().addedFiles());
    replacedert.replacedertEquals("Should have written 1 file", 1, files.size());
    long fileSize = files.get(0).fileSizeInBytes();
    long splitSize = LongMath.divide(fileSize, 2, RoundingMode.CEILING);
    Dataset<Row> resultDf = spark.read().format("iceberg").option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)).load(tableLocation);
    replacedert.replacedertEquals("Spark parreplacedions should match", 2, resultDf.javaRDD().getNumParreplacedions());
}

18 Source : TestDataSourceOptions.java
with Apache License 2.0
from apache

@Test
public void testDefaultMetadataSplitSize() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    tables.create(SCHEMA, spec, options, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"));
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
    // 32MB split size
    int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT;
    int expectedSplits = ((int) tables.load(tableLocation + "#entries").currentSnapshot().allManifests().get(0).length() + splitSize - 1) / splitSize;
    Dataset<Row> metadataDf = spark.read().format("iceberg").load(tableLocation + "#entries");
    int partitionNum = metadataDf.javaRDD().getNumPartitions();
    Assert.assertEquals("Spark partitions should match", expectedSplits, partitionNum);
}

18 Source : TestDataSourceOptions.java
with Apache License 2.0
from apache

@Test
public void testMetadataSplitSizeOptionOverrideTableProperties() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    Table table = tables.create(SCHEMA, spec, options, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"));
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    // produce 1st manifest
    originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
    // produce 2nd manifest
    originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
    List<ManifestFile> manifests = table.currentSnapshot().allManifests();
    replacedert.replacedertEquals("Must be 2 manifests", 2, manifests.size());
    // set the target metadata split size so each manifest ends up in a separate split
    table.updateProperties().set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(manifests.get(0).length())).commit();
    Dataset<Row> entriesDf = spark.read().format("iceberg").load(tableLocation + "#entries");
    replacedert.replacedertEquals("Num parreplacedions must match", 2, entriesDf.javaRDD().getNumParreplacedions());
    // override the table property using options
    entriesDf = spark.read().format("iceberg").option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)).load(tableLocation + "#entries");
    replacedert.replacedertEquals("Num parreplacedions must match", 1, entriesDf.javaRDD().getNumParreplacedions());
}
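
The split-size tests address Iceberg metadata tables by appending a suffix such as "#entries" or "#files" to the table location, which works for any HadoopTables path. A minimal read sketch (tableLocation is a placeholder for an existing table path):

// Inspect the entries and files metadata tables of a path-based table.
Dataset<Row> entries = spark.read().format("iceberg").load(tableLocation + "#entries");
Dataset<Row> files = spark.read().format("iceberg").load(tableLocation + "#files");
files.select("file_path", "record_count").show();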

18 Source : TestDataSourceOptions.java
with Apache License 2.0
from apache

@Test
public void testExtraSnapshotMetadata() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"));
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write().format("iceberg").mode("append").option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue").option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue").save(tableLocation);
    Table table = tables.load(tableLocation);
    replacedert.replacedertTrue(table.currentSnapshot().summary().get("extra-key").equals("someValue"));
    replacedert.replacedertTrue(table.currentSnapshot().summary().get("another-key").equals("anotherValue"));
}
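
Reading those snapshot properties back only requires loading the table through HadoopTables and inspecting the current snapshot's summary; a minimal sketch reusing the tableLocation and CONF from the test above:

// Print the summary properties of the current snapshot.
Table reloaded = new HadoopTables(CONF).load(tableLocation);
Map<String, String> summary = reloaded.currentSnapshot().summary();
summary.forEach((key, value) -> System.out.println(key + " = " + value));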

18 Source : TestDataFrameWrites.java
with Apache License 2.0
from apache

private Table createTable(Schema schema, File location) {
    HadoopTables tables = new HadoopTables(CONF);
    return tables.create(schema, PartitionSpec.unpartitioned(), location.toString());
}

18 Source : TestRewriteManifestsAction.java
with Apache License 2.0
from apache

@RunWith(Parameterized.class)
public abstract class TestRewriteManifestsAction extends SparkTestBase {

    private static final HadoopTables TABLES = new HadoopTables(new Configuration());

    private static final Schema SCHEMA = new Schema(optional(1, "c1", Types.IntegerType.get()), optional(2, "c2", Types.StringType.get()), optional(3, "c3", Types.StringType.get()));

    @Parameterized.Parameters(name = "snapshotIdInheritanceEnabled = {0}")
    public static Object[] parameters() {
        return new Object[] { "true", "false" };
    }

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private final String snapshotIdInheritanceEnabled;

    private String tableLocation = null;

    public TestRewriteManifestsAction(String snapshotIdInheritanceEnabled) {
        this.snapshotIdInheritanceEnabled = snapshotIdInheritanceEnabled;
    }

    @Before
    public void setupTableLocation() throws Exception {
        File tableDir = temp.newFolder();
        this.tableLocation = tableDir.toURI().toString();
    }

    @Test
    public void testRewriteManifestsEmptyTable() throws IOException {
        PartitionSpec spec = PartitionSpec.unpartitioned();
        Map<String, String> options = Maps.newHashMap();
        options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        replacedert.replacedertNull("Table must be empty", table.currentSnapshot());
        Actions actions = Actions.forTable(table);
        actions.rewriteManifests().rewriteIf(manifest -> true).stagingLocation(temp.newFolder().toString()).execute();
        replacedert.replacedertNull("Table must stay empty", table.currentSnapshot());
    }

    @Test
    public void testRewriteSmallManifestsNonPartitionedTable() {
        PartitionSpec spec = PartitionSpec.unpartitioned();
        Map<String, String> options = Maps.newHashMap();
        options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
        writeRecords(records1);
        List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
        writeRecords(records2);
        table.refresh();
        List<ManifestFile> manifests = table.currentSnapshot().allManifests();
        replacedert.replacedertEquals("Should have 2 manifests before rewrite", 2, manifests.size());
        Actions actions = Actions.forTable(table);
        RewriteManifestsActionResult result = actions.rewriteManifests().rewriteIf(manifest -> true).execute();
        replacedert.replacedertEquals("Action should rewrite 2 manifests", 2, result.deletedManifests().size());
        replacedert.replacedertEquals("Action should add 1 manifests", 1, result.addedManifests().size());
        table.refresh();
        List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
        replacedert.replacedertEquals("Should have 1 manifests after rewrite", 1, newManifests.size());
        replacedert.replacedertEquals(4, (long) newManifests.get(0).existingFilesCount());
        replacedert.replacedertFalse(newManifests.get(0).hasAddedFiles());
        replacedert.replacedertFalse(newManifests.get(0).hasDeletedFiles());
        List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
        expectedRecords.addAll(records1);
        expectedRecords.addAll(records2);
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
    }

    @Test
    public void testRewriteSmallManifestsPartitionedTable() {
        PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
        Map<String, String> options = Maps.newHashMap();
        options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
        writeRecords(records1);
        List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
        writeRecords(records2);
        List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF"));
        writeRecords(records3);
        List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH"));
        writeRecords(records4);
        table.refresh();
        List<ManifestFile> manifests = table.currentSnapshot().allManifests();
        replacedert.replacedertEquals("Should have 4 manifests before rewrite", 4, manifests.size());
        Actions actions = Actions.forTable(table);
        // we will expect to have 2 manifests with 4 entries in each after rewrite
        long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests);
        long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes);
        table.updateProperties().set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes)).commit();
        RewriteManifestsActionResult result = actions.rewriteManifests().rewriteIf(manifest -> true).execute();
        replacedert.replacedertEquals("Action should rewrite 4 manifests", 4, result.deletedManifests().size());
        replacedert.replacedertEquals("Action should add 2 manifests", 2, result.addedManifests().size());
        table.refresh();
        List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
        replacedert.replacedertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());
        replacedert.replacedertEquals(4, (long) newManifests.get(0).existingFilesCount());
        replacedert.replacedertFalse(newManifests.get(0).hasAddedFiles());
        replacedert.replacedertFalse(newManifests.get(0).hasDeletedFiles());
        replacedert.replacedertEquals(4, (long) newManifests.get(1).existingFilesCount());
        replacedert.replacedertFalse(newManifests.get(1).hasAddedFiles());
        replacedert.replacedertFalse(newManifests.get(1).hasDeletedFiles());
        List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
        expectedRecords.addAll(records1);
        expectedRecords.addAll(records2);
        expectedRecords.addAll(records3);
        expectedRecords.addAll(records4);
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
    }

    @Test
    public void testRewriteImportedManifests() throws IOException {
        ParreplacedionSpec spec = ParreplacedionSpec.builderFor(SCHEMA).idenreplacedy("c3").build();
        Map<String, String> options = Maps.newHashMap();
        options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
        File parquetTableDir = temp.newFolder("parquet_table");
        String parquetTableLocation = parquetTableDir.toURI().toString();
        try {
            Dataset<Row> inputDF = spark.createDataFrame(records, ThreeColumnRecord.class);
            inputDF.select("c1", "c2", "c3").write().format("parquet").mode("overwrite").option("path", parquetTableLocation).partitionBy("c3").saveAsTable("parquet_table");
            File stagingDir = temp.newFolder("staging-dir");
            SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
            Snapshot snapshot = table.currentSnapshot();
            Actions actions = Actions.forTable(table);
            RewriteManifestsActionResult result = actions.rewriteManifests().rewriteIf(manifest -> true).stagingLocation(temp.newFolder().toString()).execute();
            replacedert.replacedertEquals("Action should rewrite all manifests", snapshot.allManifests(), result.deletedManifests());
            replacedert.replacedertEquals("Action should add 1 manifest", 1, result.addedManifests().size());
        } finally {
            spark.sql("DROP TABLE parquet_table");
        }
    }

    @Test
    public void testRewriteLargeManifestsPartitionedTable() throws IOException {
        PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build();
        Map<String, String> options = Maps.newHashMap();
        options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        // all records belong to the same partition
        List<ThreeColumnRecord> records = Lists.newArrayList();
        for (int i = 0; i < 50; i++) {
            records.add(new ThreeColumnRecord(i, String.valueOf(i), "0"));
        }
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        // repartition to create separate files
        writeDF(df.repartition(50, df.col("c1")));
        table.refresh();
        List<ManifestFile> manifests = table.currentSnapshot().allManifests();
        replacedert.replacedertEquals("Should have 1 manifests before rewrite", 1, manifests.size());
        // set the target manifest size to a small value to force splitting records into multiple files
        table.updateProperties().set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(manifests.get(0).length() / 2)).commit();
        Actions actions = Actions.forTable(table);
        RewriteManifestsActionResult result = actions.rewriteManifests().rewriteIf(manifest -> true).stagingLocation(temp.newFolder().toString()).execute();
        replacedert.replacedertEquals("Action should rewrite 1 manifest", 1, result.deletedManifests().size());
        replacedert.replacedertEquals("Action should add 2 manifests", 2, result.addedManifests().size());
        table.refresh();
        List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
        replacedert.replacedertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", records, actualRecords);
    }

    @Test
    public void testRewriteManifestsWithPredicate() throws IOException {
        ParreplacedionSpec spec = ParreplacedionSpec.builderFor(SCHEMA).idenreplacedy("c1").truncate("c2", 2).build();
        Map<String, String> options = Maps.newHashMap();
        options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
        Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
        List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
        writeRecords(records1);
        List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
        writeRecords(records2);
        table.refresh();
        List<ManifestFile> manifests = table.currentSnapshot().allManifests();
        replacedert.replacedertEquals("Should have 2 manifests before rewrite", 2, manifests.size());
        Actions actions = Actions.forTable(table);
        // rewrite only the first manifest without caching
        RewriteManifestsActionResult result = actions.rewriteManifests().rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())).stagingLocation(temp.newFolder().toString()).useCaching(false).execute();
        replacedert.replacedertEquals("Action should rewrite 1 manifest", 1, result.deletedManifests().size());
        replacedert.replacedertEquals("Action should add 1 manifests", 1, result.addedManifests().size());
        table.refresh();
        List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
        replacedert.replacedertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());
        replacedert.replacedertFalse("First manifest must be rewritten", newManifests.contains(manifests.get(0)));
        replacedert.replacedertTrue("Second manifest must not be rewritten", newManifests.contains(manifests.get(1)));
        List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
        expectedRecords.addAll(records1);
        expectedRecords.addAll(records2);
        Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
        List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
        Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
    }

    private void writeRecords(List<ThreeColumnRecord> records) {
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        writeDF(df);
    }

    private void writeDF(Dataset<Row> df) {
        df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    }

    private long computeManifestEntrySizeBytes(List<ManifestFile> manifests) {
        long totalSize = 0L;
        int numEntries = 0;
        for (ManifestFile manifest : manifests) {
            totalSize += manifest.length();
            numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount();
        }
        return totalSize / numEntries;
    }
}
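
For readers who want to run the same rewrite-manifests action outside of a test harness, the pattern above reduces to a few lines. The sketch below is not taken from the tests; it assumes an existing Iceberg table at tableLocation, an active SparkSession, and the Spark-based Actions API used above (package names can differ between Iceberg releases), so treat it as illustrative rather than the project's reference usage.

// Minimal standalone sketch (assumptions noted above).
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.Actions;
import org.apache.iceberg.actions.RewriteManifestsActionResult;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.spark.sql.SparkSession;

public class RewriteManifestsSketch {
    public static void main(String[] args) {
        // Actions runs on Spark; the tests get a session from SparkTestBase, here we start a local one.
        SparkSession spark = SparkSession.builder().master("local[2]").appName("rewrite-manifests").getOrCreate();
        String tableLocation = args[0];  // hypothetical path to an existing Iceberg table
        HadoopTables tables = new HadoopTables(new Configuration());
        Table table = tables.load(tableLocation);
        // Rewrite every manifest; rewriteIf can narrow the action, e.g. to small manifests only.
        RewriteManifestsActionResult result = Actions.forTable(table)
            .rewriteManifests()
            .rewriteIf(manifest -> true)
            .execute();
        System.out.printf("Rewrote %d manifests into %d%n",
            result.deletedManifests().size(), result.addedManifests().size());
        spark.stop();
    }
}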

18 Source : TestExpireSnapshotsAction.java
with Apache License 2.0
from apache

public abstract class TestExpireSnapshotsAction extends SparkTestBase {

    private static final HadoopTables TABLES = new HadoopTables(new Configuration());

    private static final Schema SCHEMA = new Schema(optional(1, "c1", Types.IntegerType.get()), optional(2, "c2", Types.StringType.get()), optional(3, "c3", Types.StringType.get()));

    private static final int SHUFFLE_PARTITIONS = 2;

    private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build();

    static final DataFile FILE_A = DataFiles.builder(SPEC).withPath("/path/to/data-a.parquet").withFileSizeInBytes(10).withPartitionPath(// easy way to set partition data for now
    "c1=0").withRecordCount(1).build();

    static final DataFile FILE_B = DataFiles.builder(SPEC).withPath("/path/to/data-b.parquet").withFileSizeInBytes(10).withPartitionPath(// easy way to set partition data for now
    "c1=1").withRecordCount(1).build();

    static final DataFile FILE_C = DataFiles.builder(SPEC).withPath("/path/to/data-c.parquet").withFileSizeInBytes(10).withPartitionPath(// easy way to set partition data for now
    "c1=2").withRecordCount(1).build();

    static final DataFile FILE_D = DataFiles.builder(SPEC).withPath("/path/to/data-d.parquet").withFileSizeInBytes(10).withPartitionPath(// easy way to set partition data for now
    "c1=3").withRecordCount(1).build();

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private File tableDir;

    private String tableLocation;

    private Table table;

    @Before
    public void setupTableLocation() throws Exception {
        this.tableDir = temp.newFolder();
        this.tableLocation = tableDir.toURI().toString();
        this.table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);
        spark.conf().set("spark.sql.shuffle.parreplacedions", SHUFFLE_PARreplacedIONS);
    }

    private Long rightAfterSnapshot() {
        return rightAfterSnapshot(table.currentSnapshot().snapshotId());
    }

    private Long rightAfterSnapshot(long snapshotId) {
        Long end = System.currentTimeMillis();
        while (end <= table.snapshot(snapshotId).timestampMillis()) {
            end = System.currentTimeMillis();
        }
        return end;
    }

    private void checkExpirationResults(long expectedDatafiles, long expectedManifestsDeleted, long expectedManifestListsDeleted, ExpireSnapshotsActionResult results) {
        replacedert.replacedertEquals("Incorrect number of manifest files deleted", (Long) expectedManifestsDeleted, results.manifestFilesDeleted());
        replacedert.replacedertEquals("Incorrect number of datafiles deleted", (Long) expectedDatafiles, results.dataFilesDeleted());
        replacedert.replacedertEquals("Incorrect number of manifest lists deleted", (Long) expectedManifestListsDeleted, results.manifestListsDeleted());
    }

    @Test
    public void testFilesCleaned() throws Exception {
        table.newFastAppend().appendFile(FILE_A).commit();
        table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit();
        table.newFastAppend().appendFile(FILE_C).commit();
        long end = rightAfterSnapshot();
        ExpireSnapshotsActionResult results = Actions.forTable(table).expireSnapshots().expireOlderThan(end).execute();
        replacedert.replacedertEquals("Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots()));
        checkExpirationResults(1L, 1L, 2L, results);
    }

    @Test
    public void dataFilesCleanupWithParallelTasks() throws IOException {
        table.newFastAppend().appendFile(FILE_A).commit();
        table.newFastAppend().appendFile(FILE_B).commit();
        table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit();
        table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit();
        long t4 = rightAfterSnapshot();
        Set<String> deletedFiles = Sets.newHashSet();
        Set<String> deleteThreads = ConcurrentHashMap.newKeySet();
        AtomicInteger deleteThreadsIndex = new AtomicInteger(0);
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> {
            Thread thread = new Thread(runnable);
            thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement());
            // daemon threads will be terminated abruptly when the JVM exits
            thread.setDaemon(true);
            return thread;
        })).expireOlderThan(t4).deleteWith(s -> {
            deleteThreads.add(Thread.currentThread().getName());
            deletedFiles.add(s);
        }).execute();
        // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory
        Assert.assertEquals(deleteThreads, Sets.newHashSet("remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3"));
        Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString()));
        Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString()));
        checkExpirationResults(2L, 3L, 3L, result);
    }

    @Test
    public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception {
        table.newFastAppend().appendFile(FILE_A).commit();
        ExpireSnapshotsActionResult results = Actions.forTable(table).expireSnapshots().execute();
        checkExpirationResults(0L, 0L, 0L, results);
    }

    @Test
    public void testCleanupRepeatedOverwrites() throws Exception {
        table.newFastAppend().appendFile(FILE_A).commit();
        for (int i = 0; i < 10; i++) {
            table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit();
            table.newOverwrite().deleteFile(FILE_B).addFile(FILE_A).commit();
        }
        long end = rightAfterSnapshot();
        ExpireSnapshotsActionResult results = Actions.forTable(table).expireSnapshots().expireOlderThan(end).execute();
        checkExpirationResults(1L, 39L, 20L, results);
    }

    @Test
    public void testRetainLastWithExpireOlderThan() {
        table.newAppend().appendFile(// data_bucket=0
        FILE_A).commit();
        long firstSnapshotId = table.currentSnapshot().snapshotId();
        long t1 = System.currentTimeMillis();
        while (t1 <= table.currentSnapshot().timestampMillis()) {
            t1 = System.currentTimeMillis();
        }
        table.newAppend().appendFile(// data_bucket=1
        FILE_B).commit();
        table.newAppend().appendFile(// data_bucket=2
        FILE_C).commit();
        long t3 = rightAfterSnapshot();
        // Retain last 2 snapshots
        Actions.forTable(table).expireSnapshots().expireOlderThan(t3).retainLast(2).execute();
        replacedert.replacedertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size());
        replacedert.replacedertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId));
    }

    @Test
    public void testExpireTwoSnapshotsById() throws Exception {
        table.newAppend().appendFile(// data_bucket=0
        FILE_A).commit();
        long firstSnapshotId = table.currentSnapshot().snapshotId();
        table.newAppend().appendFile(// data_bucket=1
        FILE_B).commit();
        long secondSnapshotID = table.currentSnapshot().snapshotId();
        table.newAppend().appendFile(// data_bucket=2
        FILE_C).commit();
        // Retain last 2 snapshots
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireSnapshotId(firstSnapshotId).expireSnapshotId(secondSnapshotID).execute();
        replacedert.replacedertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size());
        replacedert.replacedertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId));
        replacedert.replacedertEquals("Second snapshot should not be present.", null, table.snapshot(secondSnapshotID));
        checkExpirationResults(0L, 0L, 2L, result);
    }

    @Test
    public void testRetainLastWithExpireById() {
        table.newAppend().appendFile(// data_bucket=0
        FILE_A).commit();
        long firstSnapshotId = table.currentSnapshot().snapshotId();
        table.newAppend().appendFile(// data_bucket=1
        FILE_B).commit();
        table.newAppend().appendFile(// data_bucket=2
        FILE_C).commit();
        // Retain last 3 snapshots, but explicitly remove the first snapshot
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireSnapshotId(firstSnapshotId).retainLast(3).execute();
        replacedert.replacedertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size());
        replacedert.replacedertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId));
        checkExpirationResults(0L, 0L, 1L, result);
    }

    @Test
    public void testRetainLastWithTooFewSnapshots() {
        table.newAppend().appendFile(// data_bucket=0
        FILE_A).appendFile(// data_bucket=1
        FILE_B).commit();
        long firstSnapshotId = table.currentSnapshot().snapshotId();
        table.newAppend().appendFile(// data_bucket=2
        FILE_C).commit();
        long t2 = rightAfterSnapshot();
        // Retain last 3 snapshots
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(t2).retainLast(3).execute();
        replacedert.replacedertEquals("Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size());
        replacedert.replacedertEquals("First snapshot should still present", firstSnapshotId, table.snapshot(firstSnapshotId).snapshotId());
        checkExpirationResults(0L, 0L, 0L, result);
    }

    @Test
    public void testRetainLastKeepsExpiringSnapshot() {
        table.newAppend().appendFile(// data_bucket=0
        FILE_A).commit();
        table.newAppend().appendFile(// data_bucket=1
        FILE_B).commit();
        Snapshot secondSnapshot = table.currentSnapshot();
        table.newAppend().appendFile(// data_bucket=2
        FILE_C).commit();
        table.newAppend().appendFile(// data_bucket=3
        FILE_D).commit();
        // Retain last 2 snapshots and expire older than t3
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(secondSnapshot.timestampMillis()).retainLast(2).execute();
        replacedert.replacedertEquals("Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size());
        replacedert.replacedertNotNull("Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId()));
        checkExpirationResults(0L, 0L, 1L, result);
    }

    @Test
    public void testExpireSnapshotsWithDisabledGarbageCollection() {
        table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit();
        table.newAppend().appendFile(FILE_A).commit();
        Actions actions = Actions.forTable(table);
        replacedertHelpers.replacedertThrows("Should complain about expiring snapshots", ValidationException.clreplaced, "Cannot expire snapshots: GC is disabled", actions::expireSnapshots);
    }

    @Test
    public void testExpireOlderThanMultipleCalls() {
        table.newAppend().appendFile(// data_bucket=0
        FILE_A).commit();
        table.newAppend().appendFile(// data_bucket=1
        FILE_B).commit();
        Snapshot secondSnapshot = table.currentSnapshot();
        table.newAppend().appendFile(// data_bucket=2
        FILE_C).commit();
        Snapshot thirdSnapshot = table.currentSnapshot();
        // Retain last 2 snapshots and expire older than t3
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(secondSnapshot.timestampMillis()).expireOlderThan(thirdSnapshot.timestampMillis()).execute();
        replacedert.replacedertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size());
        replacedert.replacedertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId()));
        checkExpirationResults(0L, 0L, 2L, result);
    }

    @Test
    public void testRetainLastMultipleCalls() {
        table.newAppend().appendFile(// data_bucket=0
        FILE_A).commit();
        table.newAppend().appendFile(// data_bucket=1
        FILE_B).commit();
        Snapshot secondSnapshot = table.currentSnapshot();
        table.newAppend().appendFile(// data_bucket=2
        FILE_C).commit();
        long t3 = rightAfterSnapshot();
        // Retain last 2 snapshots and expire older than t3
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(t3).retainLast(2).retainLast(1).execute();
        replacedert.replacedertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size());
        replacedert.replacedertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId()));
        checkExpirationResults(0L, 0L, 2L, result);
    }

    @Test
    public void testRetainZeroSnapshots() {
        replacedertHelpers.replacedertThrows("Should fail retain 0 snapshots " + "because number of snapshots to retain cannot be zero", IllegalArgumentException.clreplaced, "Number of snapshots to retain must be at least 1, cannot be: 0", () -> Actions.forTable(table).expireSnapshots().retainLast(0).execute());
    }

    @Test
    public void testScanExpiredManifestInValidSnapshotAppend() {
        table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit();
        table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit();
        table.newAppend().appendFile(FILE_D).commit();
        long t3 = rightAfterSnapshot();
        Set<String> deletedFiles = Sets.newHashSet();
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(t3).deleteWith(deletedFiles::add).execute();
        replacedert.replacedertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString()));
        checkExpirationResults(1L, 1L, 2L, result);
    }

    @Test
    public void testScanExpiredManifestInValidSnapshotFastAppend() {
        table.updateProperties().set(TableProperties.MANIFEST_MERGE_ENABLED, "true").set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1").commit();
        table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit();
        table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit();
        table.newFastAppend().appendFile(FILE_D).commit();
        long t3 = rightAfterSnapshot();
        Set<String> deletedFiles = Sets.newHashSet();
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(t3).deleteWith(deletedFiles::add).execute();
        replacedert.replacedertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString()));
        checkExpirationResults(1L, 1L, 2L, result);
    }

    /**
     * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API.
     * Table: A - C
     *          ` B (staged)
     */
    @Test
    public void testWithExpiringDanglingStageCommit() {
        // `A` commit
        table.newAppend().appendFile(FILE_A).commit();
        // `B` staged commit
        table.newAppend().appendFile(FILE_B).stageOnly().commit();
        TableMetadata base = ((BaseTable) table).operations().current();
        Snapshot snapshotA = base.snapshots().get(0);
        Snapshot snapshotB = base.snapshots().get(1);
        // `C` commit
        table.newAppend().appendFile(FILE_C).commit();
        Set<String> deletedFiles = new HashSet<>();
        // Expire all commits including dangling staged snapshot.
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().deleteWith(deletedFiles::add).expireOlderThan(snapshotB.timestampMillis() + 1).execute();
        checkExpirationResults(1L, 1L, 2L, result);
        Set<String> expectedDeletes = new HashSet<>();
        expectedDeletes.add(snapshotA.manifestListLocation());
        // Files should be deleted of dangling staged snapshot
        snapshotB.addedFiles().forEach(i -> {
            expectedDeletes.add(i.path().toString());
        });
        // ManifestList should be deleted too
        expectedDeletes.add(snapshotB.manifestListLocation());
        snapshotB.dataManifests().forEach(file -> {
            // Only the manifest of B should be deleted.
            if (file.snapshotId() == snapshotB.snapshotId()) {
                expectedDeletes.add(file.path());
            }
        });
        replacedert.replacedertSame("Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size());
        // Take the diff
        expectedDeletes.removeAll(deletedFiles);
        replacedert.replacedertTrue("Exactly same files should be deleted", expectedDeletes.isEmpty());
    }

    /**
     * Expire cherry-pick the commit as shown below, when `B` is in table's current state
     *  Table:
     *  A - B - C <--current snapshot
     *   `- D (source=B)
     */
    @Test
    public void testWithCherryPickTableSnapshot() {
        // `A` commit
        table.newAppend().appendFile(FILE_A).commit();
        Snapshot snapshotA = table.currentSnapshot();
        // `B` commit
        Set<String> deletedAFiles = new HashSet<>();
        table.newOverwrite().addFile(FILE_B).deleteFile(FILE_A).deleteWith(deletedAFiles::add).commit();
        replacedert.replacedertTrue("No files should be physically deleted", deletedAFiles.isEmpty());
        // pick the snapshot 'B`
        Snapshot snapshotB = table.currentSnapshot();
        // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick
        table.newAppend().appendFile(FILE_C).commit();
        Snapshot snapshotC = table.currentSnapshot();
        // Move the table back to `A`
        table.manageSnapshots().setCurrentSnapshot(snapshotA.snapshotId()).commit();
        // Generate A -> `D (B)`
        table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit();
        Snapshot snapshotD = table.currentSnapshot();
        // Move the table back to `C`
        table.manageSnapshots().setCurrentSnapshot(snapshotC.snapshotId()).commit();
        List<String> deletedFiles = new ArrayList<>();
        // Expire `C`
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().deleteWith(deletedFiles::add).expireOlderThan(snapshotC.timestampMillis() + 1).execute();
        // Make sure no dataFiles are deleted for the B, C, D snapshot
        Lists.newArrayList(snapshotB, snapshotC, snapshotD).forEach(i -> {
            i.addedFiles().forEach(item -> {
                Assert.assertFalse(deletedFiles.contains(item.path().toString()));
            });
        });
        checkExpirationResults(1L, 2L, 2L, result);
    }

    /**
     * Test on table below, and expiring `B` which is not in current table state.
     *  1) Expire `B`
     *  2) All commit
     * Table: A - C - D (B)
     *          ` B (staged)
     */
    @Test
    public void testWithExpiringStagedThenCherrypick() {
        // `A` commit
        table.newAppend().appendFile(FILE_A).commit();
        // `B` commit
        table.newAppend().appendFile(FILE_B).stageOnly().commit();
        // pick the snapshot that's staged but not committed
        TableMetadata base = ((BaseTable) table).operations().current();
        Snapshot snapshotB = base.snapshots().get(1);
        // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick
        table.newAppend().appendFile(FILE_C).commit();
        // `D (B)` cherry-pick commit
        table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit();
        base = ((BaseTable) table).operations().current();
        Snapshot snapshotD = base.snapshots().get(3);
        List<String> deletedFiles = new ArrayList<>();
        // Expire `B` commit.
        ExpireSnapshotsActionResult firstResult = Actions.forTable(table).expireSnapshots().deleteWith(deletedFiles::add).expireSnapshotId(snapshotB.snapshotId()).execute();
        // Make sure no dataFiles are deleted for the staged snapshot
        Lists.newArrayList(snapshotB).forEach(i -> {
            i.addedFiles().forEach(item -> {
                Assert.assertFalse(deletedFiles.contains(item.path().toString()));
            });
        });
        checkExpirationResults(0L, 1L, 1L, firstResult);
        // Expire all snapshots including cherry-pick
        ExpireSnapshotsActionResult secondResult = Actions.forTable(table).expireSnapshots().deleteWith(deletedFiles::add).expireOlderThan(table.currentSnapshot().timestampMillis() + 1).execute();
        // Make sure no dataFiles are deleted for the staged and cherry-pick
        Lists.newArrayList(snapshotB, snapshotD).forEach(i -> {
            i.addedFiles().forEach(item -> {
                Assert.assertFalse(deletedFiles.contains(item.path().toString()));
            });
        });
        checkExpirationResults(0L, 0L, 2L, secondResult);
    }

    @Test
    public void testExpireOlderThan() {
        table.newAppend().appendFile(FILE_A).commit();
        Snapshot firstSnapshot = table.currentSnapshot();
        rightAfterSnapshot();
        table.newAppend().appendFile(FILE_B).commit();
        long snapshotId = table.currentSnapshot().snapshotId();
        long tAfterCommits = rightAfterSnapshot();
        Set<String> deletedFiles = Sets.newHashSet();
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).execute();
        replacedert.replacedertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId());
        replacedert.replacedertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId()));
        replacedert.replacedertEquals("Should remove only the expired manifest list location", Sets.newHashSet(firstSnapshot.manifestListLocation()), deletedFiles);
        checkExpirationResults(0, 0, 1, result);
    }

    @Test
    public void testExpireOlderThanWithDelete() {
        table.newAppend().appendFile(FILE_A).commit();
        Snapshot firstSnapshot = table.currentSnapshot();
        replacedert.replacedertEquals("Should create one manifest", 1, firstSnapshot.allManifests().size());
        rightAfterSnapshot();
        table.newDelete().deleteFile(FILE_A).commit();
        Snapshot secondSnapshot = table.currentSnapshot();
        replacedert.replacedertEquals("Should create replace manifest with a rewritten manifest", 1, secondSnapshot.allManifests().size());
        table.newAppend().appendFile(FILE_B).commit();
        rightAfterSnapshot();
        long snapshotId = table.currentSnapshot().snapshotId();
        long tAfterCommits = rightAfterSnapshot();
        Set<String> deletedFiles = Sets.newHashSet();
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).execute();
        replacedert.replacedertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId());
        replacedert.replacedertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId()));
        replacedert.replacedertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId()));
        replacedert.replacedertEquals("Should remove expired manifest lists and deleted data file", Sets.newHashSet(// snapshot expired
        firstSnapshot.manifestListLocation(), // manifest was rewritten for delete
        firstSnapshot.allManifests().get(0).path(), // snapshot expired
        secondSnapshot.manifestListLocation(), // manifest contained only deletes, was dropped
        secondSnapshot.allManifests().get(0).path(), // deleted
        FILE_A.path()), deletedFiles);
        checkExpirationResults(1, 2, 2, result);
    }

    @Test
    public void testExpireOlderThanWithDeleteInMergedManifests() {
        // merge every commit
        table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit();
        table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit();
        Snapshot firstSnapshot = table.currentSnapshot();
        replacedert.replacedertEquals("Should create one manifest", 1, firstSnapshot.allManifests().size());
        rightAfterSnapshot();
        table.newDelete().deleteFile(// FILE_B is still in the dataset
        FILE_A).commit();
        Snapshot secondSnapshot = table.currentSnapshot();
        replacedert.replacedertEquals("Should replace manifest with a rewritten manifest", 1, secondSnapshot.allManifests().size());
        // do not merge to keep the last snapshot's manifest valid
        table.newFastAppend().appendFile(FILE_C).commit();
        rightAfterSnapshot();
        long snapshotId = table.currentSnapshot().snapshotId();
        long tAfterCommits = rightAfterSnapshot();
        Set<String> deletedFiles = Sets.newHashSet();
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).execute();
        replacedert.replacedertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId());
        replacedert.replacedertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId()));
        replacedert.replacedertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId()));
        replacedert.replacedertEquals("Should remove expired manifest lists and deleted data file", Sets.newHashSet(// snapshot expired
        firstSnapshot.manifestListLocation(), // manifest was rewritten for delete
        firstSnapshot.allManifests().get(0).path(), // snapshot expired
        secondSnapshot.manifestListLocation(), // deleted
        FILE_A.path()), deletedFiles);
        checkExpirationResults(1, 1, 2, result);
    }

    @Test
    public void testExpireOlderThanWithRollback() {
        // merge every commit
        table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit();
        table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit();
        Snapshot firstSnapshot = table.currentSnapshot();
        replacedert.replacedertEquals("Should create one manifest", 1, firstSnapshot.allManifests().size());
        rightAfterSnapshot();
        table.newDelete().deleteFile(FILE_B).commit();
        Snapshot secondSnapshot = table.currentSnapshot();
        Set<ManifestFile> secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests());
        secondSnapshotManifests.removeAll(firstSnapshot.allManifests());
        replacedert.replacedertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size());
        table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit();
        long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId());
        long snapshotId = table.currentSnapshot().snapshotId();
        Set<String> deletedFiles = Sets.newHashSet();
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).execute();
        replacedert.replacedertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId());
        replacedert.replacedertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId()));
        replacedert.replacedertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId()));
        replacedert.replacedertEquals("Should remove expired manifest lists and reverted appended data file", Sets.newHashSet(// snapshot expired
        secondSnapshot.manifestListLocation(), // manifest is no longer referenced
        Iterables.getOnlyElement(secondSnapshotManifests).path()), deletedFiles);
        checkExpirationResults(0, 1, 1, result);
    }

    @Test
    public void testExpireOlderThanWithRollbackAndMergedManifests() {
        table.newAppend().appendFile(FILE_A).commit();
        Snapshot firstSnapshot = table.currentSnapshot();
        replacedert.replacedertEquals("Should create one manifest", 1, firstSnapshot.allManifests().size());
        rightAfterSnapshot();
        table.newAppend().appendFile(FILE_B).commit();
        Snapshot secondSnapshot = table.currentSnapshot();
        Set<ManifestFile> secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests());
        secondSnapshotManifests.removeAll(firstSnapshot.allManifests());
        replacedert.replacedertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size());
        table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit();
        long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId());
        long snapshotId = table.currentSnapshot().snapshotId();
        Set<String> deletedFiles = Sets.newHashSet();
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).execute();
        replacedert.replacedertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId());
        replacedert.replacedertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId()));
        replacedert.replacedertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId()));
        replacedert.replacedertEquals("Should remove expired manifest lists and reverted appended data file", Sets.newHashSet(// snapshot expired
        secondSnapshot.manifestListLocation(), // manifest is no longer referenced
        Iterables.getOnlyElement(secondSnapshotManifests).path(), // added, but rolled back
        FILE_B.path()), deletedFiles);
        checkExpirationResults(1, 1, 1, result);
    }

    @Test
    public void testExpireOnEmptyTable() {
        Set<String> deletedFiles = Sets.newHashSet();
        // table has no data, testing ExpireSnapshots should not fail with no snapshot
        ExpireSnapshotsActionResult result = Actions.forTable(table).expireSnapshots().expireOlderThan(System.currentTimeMillis()).deleteWith(deletedFiles::add).execute();
        checkExpirationResults(0, 0, 0, result);
    }

    @Test
    public void testExpireAction() {
        table.newAppend().appendFile(FILE_A).commit();
        Snapshot firstSnapshot = table.currentSnapshot();
        rightAfterSnapshot();
        table.newAppend().appendFile(FILE_B).commit();
        long snapshotId = table.currentSnapshot().snapshotId();
        long tAfterCommits = rightAfterSnapshot();
        Set<String> deletedFiles = Sets.newHashSet();
        ExpireSnapshotsAction action = Actions.forTable(table).expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add);
        Dataset<Row> pendingDeletes = action.expire();
        List<Row> pending = pendingDeletes.collectAsList();
        replacedert.replacedertEquals("Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId());
        replacedert.replacedertNull("Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId()));
        replacedert.replacedertEquals("Pending deletes should contain one row", 1, pending.size());
        replacedert.replacedertEquals("Pending delete should be the expired manifest list location", firstSnapshot.manifestListLocation(), pending.get(0).getString(0));
        replacedert.replacedertEquals("Pending delete should be a manifest list", "Manifest List", pending.get(0).getString(1));
        replacedert.replacedertEquals("Should not delete any files", 0, deletedFiles.size());
        replacedert.replacedertSame("Multiple calls to expire should return the same deleted files", pendingDeletes, action.expire());
    }

    @Test
    public void testUseLocalIterator() {
        table.newFastAppend().appendFile(FILE_A).commit();
        table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit();
        table.newFastAppend().appendFile(FILE_C).commit();
        long end = rightAfterSnapshot();
        int jobsBefore = spark.sparkContext().dagScheduler().nextJobId().get();
        ExpireSnapshotsActionResult results = Actions.forTable(table).expireSnapshots().expireOlderThan(end).streamDeleteResults(true).execute();
        replacedert.replacedertEquals("Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots()));
        int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get();
        int totalJobsRun = jobsAfter - jobsBefore;
        checkExpirationResults(1L, 1L, 2L, results);
        Assert.assertTrue(String.format("Expected more than %d jobs when using local iterator, ran %d", SHUFFLE_PARTITIONS, totalJobsRun), totalJobsRun > SHUFFLE_PARTITIONS);
    }
}
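
Outside JUnit, the same expire-snapshots action against a HadoopTables-backed table reduces to the pattern below. This is a minimal sketch, not code from the tests: it assumes an existing table at tableLocation, an active SparkSession, and the Spark-based Actions API used above; the retention values are purely illustrative.

// Minimal standalone sketch (assumptions noted above).
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.Actions;
import org.apache.iceberg.actions.ExpireSnapshotsActionResult;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.spark.sql.SparkSession;

public class ExpireSnapshotsSketch {
    public static void main(String[] args) {
        // The action runs on Spark; the tests get a session from SparkTestBase, here we start a local one.
        SparkSession spark = SparkSession.builder().master("local[2]").appName("expire-snapshots").getOrCreate();
        String tableLocation = args[0];  // hypothetical location of an existing Iceberg table
        Table table = new HadoopTables(new Configuration()).load(tableLocation);
        long olderThan = System.currentTimeMillis() - 24L * 60 * 60 * 1000;  // illustrative: expire anything older than one day
        ExpireSnapshotsActionResult result = Actions.forTable(table)
            .expireSnapshots()
            .expireOlderThan(olderThan)
            .retainLast(5)
            .execute();
        System.out.printf("deleted: %d data files, %d manifests, %d manifest lists%n",
            result.dataFilesDeleted(), result.manifestFilesDeleted(), result.manifestListsDeleted());
        spark.stop();
    }
}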

17 Source : TestInputFormatWithEmptyTable.java
with Apache License 2.0
from ExpediaGroup

@Before
public void before() throws IOException {
    tableLocation = temp.newFolder();
    Schema schema = new Schema(required(1, "id", Types.LongType.get()), optional(2, "data", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables();
    Table table = tables.create(schema, spec, tableLocation.getAbsolutePath());
}
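
The snippet above only creates an empty table. A HadoopTables instance built the same way can re-open that table later from its location alone; the sketch below illustrates this under the assumption that tableLocation is the directory passed to tables.create(...) above (it is not part of the original example).

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;

// Sketch only: tableLocation is assumed to point at an already-created HadoopTables table.
public class LoadHadoopTableSketch {
    public static void main(String[] args) {
        String tableLocation = args[0];
        // The no-arg constructor above uses a default Configuration; pass an explicit one
        // when filesystem settings (HDFS, S3A, etc.) are needed.
        HadoopTables tables = new HadoopTables(new Configuration());
        Table table = tables.load(tableLocation);
        System.out.println("schema: " + table.schema());
        System.out.println("spec:   " + table.spec());
    }
}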

17 Source : TestSparkDataWrite.java
with Apache License 2.0
from apache

@Test
public void testOverwrite() throws IOException {
    File parent = temp.newFolder(format.toString());
    File location = new File(parent, "test");
    HadoopTables tables = new HadoopTables(CONF);
    ParreplacedionSpec spec = ParreplacedionSpec.builderFor(SCHEMA).idenreplacedy("id").build();
    Table table = tables.create(SCHEMA, spec, location.toString());
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "a"), new SimpleRecord(3, "c"), new SimpleRecord(4, "b"), new SimpleRecord(6, "c"));
    Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);
    df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
    // overwrite with 2*id to replace record 2, append 4 and 6
    df.withColumn("id", df.col("id").multiply(2)).select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Overwrite).option("overwrite-mode", "dynamic").save(location.toString());
    table.refresh();
    Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
    List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}

17 Source : TestSparkDataWrite.java
with Apache License 2.0
from apache

@Test
public void testUnpartitionedOverwrite() throws IOException {
    File parent = temp.newFolder(format.toString());
    File location = new File(parent, "test");
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Table table = tables.create(SCHEMA, spec, location.toString());
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
    // overwrite with the same data; should not produce two copies
    df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Overwrite).save(location.toString());
    table.refresh();
    Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
    List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}

17 Source : TestSparkDataWrite.java
with Apache License 2.0
from apache

@Test
public void testAppend() throws IOException {
    File parent = temp.newFolder(format.toString());
    File location = new File(parent, "test");
    HadoopTables tables = new HadoopTables(CONF);
    ParreplacedionSpec spec = ParreplacedionSpec.builderFor(SCHEMA).idenreplacedy("data").build();
    Table table = tables.create(SCHEMA, spec, location.toString());
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"), new SimpleRecord(4, "a"), new SimpleRecord(5, "b"), new SimpleRecord(6, "c"));
    Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);
    df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
    df.withColumn("id", df.col("id").plus(3)).select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
    table.refresh();
    Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
    List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}

17 Source : TestPartitionValues.java
with Apache License 2.0
from apache

@Test
public void testReorderedColumns() throws Exception {
    String desc = "reorder_columns";
    File parent = temp.newFolder(desc);
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    replacedert.replacedertTrue("mkdirs should succeed", dataFolder.mkdirs());
    HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
    Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString());
    table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    df.select("data", "id").write().format("iceberg").mode(SaveMode.Append).option(SparkWriteOptions.CHECK_ORDERING, "false").save(location.toString());
    Dataset<Row> result = spark.read().format("iceberg").option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load(location.toString());
    List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}

17 Source : TestPartitionValues.java
with Apache License 2.0
from apache

@Test
public void testNullPartitionValue() throws Exception {
    String desc = "null_part";
    File parent = temp.newFolder(desc);
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    replacedert.replacedertTrue("mkdirs should succeed", dataFolder.mkdirs());
    HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
    Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString());
    table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"), new SimpleRecord(4, null));
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    df.select("id", "data").write().format("iceberg").mode(SaveMode.Append).save(location.toString());
    Dataset<Row> result = spark.read().format("iceberg").option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load(location.toString());
    List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}

17 Source : TestPartitionValues.java
with Apache License 2.0
from apache

@Test
public void testReorderedColumnsNoNullability() throws Exception {
    String desc = "reorder_columns_no_nullability";
    File parent = temp.newFolder(desc);
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    replacedert.replacedertTrue("mkdirs should succeed", dataFolder.mkdirs());
    HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
    Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString());
    table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    df.select("data", "id").write().format("iceberg").mode(SaveMode.Append).option(SparkWriteOptions.CHECK_ORDERING, "false").option(SparkWriteOptions.CHECK_NULLABILITY, "false").save(location.toString());
    Dataset<Row> result = spark.read().format("iceberg").option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load(location.toString());
    List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}

17 Source : TestForwardCompatibility.java
with Apache License 2.0
from apache

@Test
public void testSparkWriteFailsUnknownTransform() throws IOException {
    File parent = temp.newFolder("avro");
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    dataFolder.mkdirs();
    HadoopTables tables = new HadoopTables(CONF);
    tables.create(SCHEMA, UNKNOWN_SPEC, location.toString());
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    AssertHelpers.assertThrows("Should reject write with unsupported transform", UnsupportedOperationException.class, "Cannot write using unsupported transforms: zero", () -> df.select("id", "data").write().format("iceberg").mode("append").save(location.toString()));
}

17 Source : TestInputFormatReaderDeletes.java
with Apache License 2.0
from apache

@RunWith(Parameterized.class)
public class TestInputFormatReaderDeletes extends DeleteReadTests {

    private final Configuration conf = new Configuration();

    private final HadoopTables tables = new HadoopTables(conf);

    private TestHelper helper;

    // parametrized variables
    private final String inputFormat;

    private final FileFormat fileFormat;

    @Parameterized.Parameters(name = "inputFormat = {0}, fileFormat={1}")
    public static Object[][] parameters() {
        return new Object[][] { { "IcebergInputFormat", FileFormat.PARQUET }, { "IcebergInputFormat", FileFormat.AVRO }, { "IcebergInputFormat", FileFormat.ORC }, { "MapredIcebergInputFormat", FileFormat.PARQUET }, { "MapredIcebergInputFormat", FileFormat.AVRO }, { "MapredIcebergInputFormat", FileFormat.ORC } };
    }

    public TestInputFormatReaderDeletes(String inputFormat, FileFormat fileFormat) {
        this.inputFormat = inputFormat;
        this.fileFormat = fileFormat;
    }

    @Override
    protected Table createTable(String name, Schema schema, PartitionSpec spec) throws IOException {
        Table table;
        File location = temp.newFolder(inputFormat, fileFormat.name());
        Assert.assertTrue(location.delete());
        helper = new TestHelper(conf, tables, location.toString(), schema, spec, fileFormat, temp);
        table = helper.createTable();
        TableOperations ops = ((BaseTable) table).operations();
        TableMetadata meta = ops.current();
        ops.commit(meta, meta.upgradeToFormatVersion(2));
        return table;
    }

    @Override
    protected void dropTable(String name) {
        tables.dropTable(helper.table().location());
    }

    @Override
    public StructLikeSet rowSet(String name, Table table, String... columns) {
        InputFormatConfig.ConfigBuilder builder = new InputFormatConfig.ConfigBuilder(conf).readFrom(table.location());
        Schema projected = table.schema().select(columns);
        StructLikeSet set = StructLikeSet.create(projected.asStruct());
        set.addAll(TestIcebergInputFormats.TESTED_INPUT_FORMATS.stream().filter(recordFactory -> recordFactory.name().equals(inputFormat)).map(recordFactory -> recordFactory.create(builder.project(projected).conf()).getRecords()).flatMap(List::stream).collect(Collectors.toList()));
        return set;
    }

    @Override
    protected boolean expectPruned() {
        return false;
    }
}
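
The createTable helper above upgrades the freshly created table to format version 2 through TableOperations so that delete files can be read back. A standalone sketch of that upgrade step is below; note that BaseTable, TableOperations and TableMetadata are internal classes rather than public API, so this simply mirrors the test under the assumption that tableLocation points at an existing v1 table.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.BaseTable;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.hadoop.HadoopTables;

// Sketch only: mirrors the format-version upgrade done in createTable() above.
public class UpgradeFormatVersionSketch {
    public static void main(String[] args) {
        String tableLocation = args[0];  // hypothetical location of an existing v1 table
        Table table = new HadoopTables(new Configuration()).load(tableLocation);
        TableOperations ops = ((BaseTable) table).operations();
        TableMetadata current = ops.current();
        // Commit new metadata whose format version is 2 (required for row-level deletes).
        ops.commit(current, current.upgradeToFormatVersion(2));
    }
}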

17 Source : TestIcebergInputFormats.java
with Apache License 2.0
from apache

@Before
public void before() throws IOException {
    conf = new Configuration();
    HadoopTables tables = new HadoopTables(conf);
    File location = temp.newFolder(testInputFormat.name(), fileFormat.name());
    Assert.assertTrue(location.delete());
    helper = new TestHelper(conf, tables, location.toString(), SCHEMA, SPEC, fileFormat, temp);
    builder = new InputFormatConfig.ConfigBuilder(conf).readFrom(location.toString());
}

17 Source : TestSplitPlanning.java
with Apache License 2.0
from apache

@RunWith(Parameterized.class)
public class TestSplitPlanning extends TableTestBase {

    private static final Configuration CONF = new Configuration();

    private static final HadoopTables TABLES = new HadoopTables(CONF);

    private static final Schema SCHEMA = new Schema(optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get()));

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private Table table = null;

    @Parameterized.Parameters(name = "formatVersion = {0}")
    public static Object[] parameters() {
        return new Object[] { 1, 2 };
    }

    public TestSplitPlanning(int formatVersion) {
        super(formatVersion);
    }

    @Before
    public void setupTable() throws IOException {
        File tableDir = temp.newFolder();
        String tableLocation = tableDir.toURI().toString();
        table = TABLES.create(SCHEMA, tableLocation);
        table.updateProperties().set(TableProperties.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)).set(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(4 * 1024 * 1024)).set(TableProperties.SPLIT_LOOKBACK, String.valueOf(Integer.MAX_VALUE)).commit();
    }

    @Test
    public void testBasicSplitPlanning() {
        List<DataFile> files128Mb = newFiles(4, 128 * 1024 * 1024);
        appendFiles(files128Mb);
        // we expect 4 bins since split size is 128MB and we have 4 files 128MB each
        Assert.assertEquals(4, Iterables.size(table.newScan().planTasks()));
        List<DataFile> files32Mb = newFiles(16, 32 * 1024 * 1024);
        appendFiles(files32Mb);
        // we expect 8 bins after we add 16 files 32MB each as they will form additional 4 bins
        Assert.assertEquals(8, Iterables.size(table.newScan().planTasks()));
    }

    @Test
    public void testSplitPlanningWithSmallFiles() {
        List<DataFile> files60Mb = newFiles(50, 60 * 1024 * 1024);
        List<DataFile> files5Kb = newFiles(370, 5 * 1024);
        Iterable<DataFile> files = Iterables.concat(files60Mb, files5Kb);
        appendFiles(files);
        // 50 files of size 60MB will form 25 bins as split size is 128MB
        // each of those bins will have 8MB left and all 370 files of size 5KB would end up
        // in one of them without "read.split.open-file-cost"
        // as "read.split.open-file-cost" is 4MB, each of the original 25 bins will get at most 2 files
        // so 50 of 370 files will be packed into the existing 25 bins and the remaining 320 files
        // will form additional 10 bins, resulting in 35 bins in total
        Assert.assertEquals(35, Iterables.size(table.newScan().planTasks()));
    }

    @Test
    public void testSplitPlanningWithNoMinWeight() {
        table.updateProperties().set(TableProperties.SPLIT_OPEN_FILE_COST, "0").commit();
        List<DataFile> files60Mb = newFiles(2, 60 * 1024 * 1024);
        List<DataFile> files5Kb = newFiles(100, 5 * 1024);
        Iterable<DataFile> files = Iterables.concat(files60Mb, files5Kb);
        appendFiles(files);
        // all small files will be packed into one bin as "read.split.open-file-cost" is set to 0
        Assert.assertEquals(1, Iterables.size(table.newScan().planTasks()));
    }

    @Test
    public void testSplitPlanningWithOverridenSize() {
        List<DataFile> files128Mb = newFiles(4, 128 * 1024 * 1024);
        appendFiles(files128Mb);
        // we expect 2 bins since we are overriding split size in scan with 256MB
        TableScan scan = table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(256L * 1024 * 1024));
        Assert.assertEquals(2, Iterables.size(scan.planTasks()));
    }

    @Test
    public void testSplitPlanningWithOverriddenSizeForMetadataJsonFile() {
        List<DataFile> files8Mb = newFiles(32, 8 * 1024 * 1024, FileFormat.METADATA);
        appendFiles(files8Mb);
        // we expect 16 bins since we are overriding split size in scan with 16MB
        TableScan scan = table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(16L * 1024 * 1024));
        Assert.assertEquals(16, Iterables.size(scan.planTasks()));
    }

    @Test
    public void testSplitPlanningWithOverriddenSizeForLargeMetadataJsonFile() {
        List<DataFile> files128Mb = newFiles(4, 128 * 1024 * 1024, FileFormat.METADATA);
        appendFiles(files128Mb);
        // although overriding split size in scan with 8MB, we expect 4 bins since metadata file is not splittable
        TableScan scan = table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(8L * 1024 * 1024));
        Assert.assertEquals(4, Iterables.size(scan.planTasks()));
    }

    @Test
    public void testSplitPlanningWithOverridenLookback() {
        List<DataFile> files120Mb = newFiles(1, 120 * 1024 * 1024);
        List<DataFile> file128Mb = newFiles(1, 128 * 1024 * 1024);
        Iterable<DataFile> files = Iterables.concat(files120Mb, file128Mb);
        appendFiles(files);
        // we expect 2 bins from non-overriden table properties
        TableScan scan = table.newScan().option(TableProperties.SPLIT_LOOKBACK, "1");
        CloseableIterable<CombinedScanTask> tasks = scan.planTasks();
        Assert.assertEquals(2, Iterables.size(tasks));
        // since lookback was overridden to 1, we expect the first bin to be the largest of the two.
        CombinedScanTask combinedScanTask = tasks.iterator().next();
        FileScanTask task = combinedScanTask.files().iterator().next();
        Assert.assertEquals(128 * 1024 * 1024, task.length());
    }

    @Test
    public void testSplitPlanningWithOverridenOpenCostSize() {
        List<DataFile> files16Mb = newFiles(16, 16 * 1024 * 1024);
        appendFiles(files16Mb);
        // we expect 4 bins since we are overriding open file cost in scan with a cost of 32MB
        // we can fit at most 128Mb/32Mb = 4 files per bin
        TableScan scan = table.newScan().option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(32L * 1024 * 1024));
        Assert.assertEquals(4, Iterables.size(scan.planTasks()));
    }

    private void appendFiles(Iterable<DataFile> files) {
        AppendFiles appendFiles = table.newAppend();
        files.forEach(appendFiles::appendFile);
        appendFiles.commit();
    }

    private List<DataFile> newFiles(int numFiles, long sizeInBytes) {
        return newFiles(numFiles, sizeInBytes, FileFormat.PARQUET);
    }

    private List<DataFile> newFiles(int numFiles, long sizeInBytes, FileFormat fileFormat) {
        List<DataFile> files = Lists.newArrayList();
        for (int fileNum = 0; fileNum < numFiles; fileNum++) {
            files.add(newFile(sizeInBytes, fileFormat));
        }
        return files;
    }

    private DataFile newFile(long sizeInBytes, FileFormat fileFormat) {
        String fileName = UUID.randomUUID().toString();
        return DataFiles.builder(PartitionSpec.unpartitioned()).withPath(fileFormat.addExtension(fileName)).withFileSizeInBytes(sizeInBytes).withRecordCount(2).build();
    }
}
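
For reference, a brief sketch of the two ways the split-planning knobs exercised above can be applied: persistently through table properties, or per scan. The 64 MB value and class name are illustrative only, and Iterables here is plain Guava, whereas the test above may use Iceberg's relocated copy.

import com.google.common.collect.Iterables;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.TableScan;

class SplitPlanningSketch {

    static int countBins(Table table) {
        // persist a 64 MB split size as a table property
        table.updateProperties().set(TableProperties.SPLIT_SIZE, String.valueOf(64L * 1024 * 1024)).commit();
        // or override it for a single scan without touching the table properties
        TableScan scan = table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(64L * 1024 * 1024));
        // each combined task corresponds to one bin of data files
        return Iterables.size(scan.planTasks());
    }
}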

16 Source : TestInputFormatWithMultipleTasks.java
with Apache License 2.0
from ExpediaGroup

@Before
public void before() throws IOException {
    tableLocation = temp.newFolder();
    Schema schema = new Schema(required(1, "id", Types.LongType.get()), optional(2, "data", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables();
    Table table = tables.create(schema, spec, tableLocation.getAbsolutePath());
    List<Record> data = new ArrayList<>();
    data.add(TestHelpers.createSimpleRecord(1L, "Michael"));
    data.add(TestHelpers.createSimpleRecord(2L, "Andy"));
    DataFile fileA = TestHelpers.writeFile(temp.newFile(), table, null, FileFormat.PARQUET, data);
    table.newAppend().appendFile(fileA).commit();
    DataFile fileB = TestHelpers.writeFile(temp.newFile(), table, null, FileFormat.PARQUET, data);
    table.newAppend().appendFile(fileB).commit();
    snapshotId = table.currentSnapshot().snapshotId();
}

16 Source : TestIcebergScan.java
with Apache License 2.0
from dremio

@Test
public void testPartitionMismatchSpecSchema() throws Exception {
    try (AutoCloseable c = enableIcebergTables()) {
        Path p = new Path(testRootPath);
        if (fs.exists(p)) {
            fs.delete(p, true);
        }
        fs.mkdirs(p);
        copyFromJar("iceberg/partitionednation", java.nio.file.Paths.get(testRootPath));
        HadoopTables tables = new HadoopTables(conf);
        Table table = tables.load(testRootPath);
        // n_regionkey was renamed to regionkey
        assertNull(table.schema().findField("n_regionkey"));
        assertNotNull(table.schema().findField("regionkey"));
        assertEquals(1, table.spec().fields().size());
        // no change in partition spec
        assertEquals("n_regionkey", table.spec().fields().get(0).name());
        IcebergTableInfo tableInfo = new IcebergTableWrapper(getSabotContext(), HadoopFileSystem.get(fs), conf, new File(testRootPath).getAbsolutePath()).getTableInfo();
        assertEquals(1, tableInfo.getPartitionColumns().size());
        // partition column matches new column name
        assertEquals("regionkey", tableInfo.getPartitionColumns().get(0));
    }
}

16 Source : TestFilteredScan.java
with Apache License 2.0
from apache

@RunWith(Parameterized.class)
public class TestFilteredScan {

    private static final Configuration CONF = new Configuration();

    private static final HadoopTables TABLES = new HadoopTables(CONF);

    private static final Schema SCHEMA = new Schema(Types.NestedField.required(1, "id", Types.LongType.get()), Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), Types.NestedField.optional(3, "data", Types.StringType.get()));

    private static final PartitionSpec BUCKET_BY_ID = PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build();

    private static final PartitionSpec PARTITION_BY_DAY = PartitionSpec.builderFor(SCHEMA).day("ts").build();

    private static final PartitionSpec PARTITION_BY_HOUR = PartitionSpec.builderFor(SCHEMA).hour("ts").build();

    private static final PartitionSpec PARTITION_BY_DATA = PartitionSpec.builderFor(SCHEMA).identity("data").build();

    private static final PartitionSpec PARTITION_BY_ID = PartitionSpec.builderFor(SCHEMA).identity("id").build();

    private static SparkSession spark = null;

    @BeforeClass
    public static void startSpark() {
        TestFilteredScan.spark = SparkSession.builder().master("local[2]").getOrCreate();
        // define UDFs used by partition tests
        Transform<Long, Integer> bucket4 = Transforms.bucket(Types.LongType.get(), 4);
        spark.udf().register("bucket4", (UDF1<Long, Integer>) bucket4::apply, IntegerType$.MODULE$);
        Transform<Long, Integer> day = Transforms.day(Types.TimestampType.withZone());
        spark.udf().register("ts_day", (UDF1<Timestamp, Integer>) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), IntegerType$.MODULE$);
        Transform<Long, Integer> hour = Transforms.hour(Types.TimestampType.withZone());
        spark.udf().register("ts_hour", (UDF1<Timestamp, Integer>) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), IntegerType$.MODULE$);
        spark.udf().register("data_ident", (UDF1<String, String>) data -> data, StringType$.MODULE$);
        spark.udf().register("id_ident", (UDF1<Long, Long>) id -> id, LongType$.MODULE$);
    }

    @AfterClass
    public static void stopSpark() {
        SparkSession currentSpark = TestFilteredScan.spark;
        TestFilteredScan.spark = null;
        currentSpark.stop();
    }

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private final String format;

    private final boolean vectorized;

    @Parameterized.Parameters(name = "format = {0}, vectorized = {1}")
    public static Object[][] parameters() {
        return new Object[][] { { "parquet", false }, { "parquet", true }, { "avro", false }, { "orc", false }, { "orc", true } };
    }

    public TestFilteredScan(String format, boolean vectorized) {
        this.format = format;
        this.vectorized = vectorized;
    }

    private File parent = null;

    private File unpartitioned = null;

    private List<Record> records = null;

    @Before
    public void writeUnpartitionedTable() throws IOException {
        this.parent = temp.newFolder("TestFilteredScan");
        this.unpartitioned = new File(parent, "unpartitioned");
        File dataFolder = new File(unpartitioned, "data");
        Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs());
        Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString());
        // use the table schema because ids are reassigned
        Schema tableSchema = table.schema();
        FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));
        File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));
        this.records = testRecords(tableSchema);
        try (FileAppender<Record> writer = new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) {
            writer.addAll(records);
        }
        DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()).withRecordCount(records.size()).withFileSizeInBytes(testFile.length()).withPath(testFile.toString()).build();
        table.newAppend().appendFile(file).commit();
    }

    @Test
    public void testUnpartitionedIDFilters() {
        CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));
        SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
        for (int i = 0; i < 10; i += 1) {
            pushFilters(builder, EqualTo.apply("id", i));
            Batch scan = builder.build().toBatch();
            InputPartition[] partitions = scan.planInputPartitions();
            Assert.assertEquals("Should only create one task for a small file", 1, partitions.length);
            // validate row filtering
            assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i));
        }
    }

    @Test
    public void testUnpartitionedCaseInsensitiveIDFilters() {
        CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));
        // set spark.sql.caseSensitive to false
        String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
        TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");
        try {
            for (int i = 0; i < 10; i += 1) {
                SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).caseSensitive(false);
                // note lower(ID) == lower(id), so there must be a match
                pushFilters(builder, EqualTo.apply("ID", i));
                Batch scan = builder.build().toBatch();
                InputPartition[] tasks = scan.planInputPartitions();
                Assert.assertEquals("Should only create one task for a small file", 1, tasks.length);
                // validate row filtering
                assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i));
            }
        } finally {
            // return global conf to previous state
            TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
        }
    }

    @Test
    public void testUnpartitionedTimestampFilter() {
        CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString()));
        SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
        pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));
        Batch scan = builder.build().toBatch();
        InputPartition[] tasks = scan.planInputPartitions();
        Assert.assertEquals("Should only create one task for a small file", 1, tasks.length);
        assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
    }

    @Test
    public void testBucketPartitionedIDFilters() {
        Table table = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");
        CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));
        Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch();
        Assert.assertEquals("Unfiltered table should created 4 read tasks", 4, unfiltered.planInputPartitions().length);
        for (int i = 0; i < 10; i += 1) {
            SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
            pushFilters(builder, EqualTo.apply("id", i));
            Batch scan = builder.build().toBatch();
            InputPartition[] tasks = scan.planInputPartitions();
            // validate predicate push-down
            Assert.assertEquals("Should create one task for a single bucket", 1, tasks.length);
            // validate row filtering
            assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(table.location(), vectorized, "id = " + i));
        }
    }

    @SuppressWarnings("checkstyle:AvoidNestedBlocks")
    @Test
    public void testDayPartitionedTimestampFilters() {
        Table table = buildPartitionedTable("partitioned_by_day", PARTITION_BY_DAY, "ts_day", "ts");
        CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));
        Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch();
        Assert.assertEquals("Unfiltered table should created 2 read tasks", 2, unfiltered.planInputPartitions().length);
        {
            SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
            pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));
            Batch scan = builder.build().toBatch();
            InputPartition[] tasks = scan.planInputPartitions();
            Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.length);
            assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
        }
        {
            SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
            pushFilters(builder, And.apply(GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), LessThan.apply("ts", "2017-12-22T08:00:00+00:00")));
            Batch scan = builder.build().toBatch();
            InputPartition[] tasks = scan.planInputPartitions();
            Assert.assertEquals("Should create one task for 2017-12-22", 1, tasks.length);
            assertEqualsSafe(SCHEMA.asStruct(), expected(1, 2), read(table.location(), vectorized, "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)"));
        }
    }

    @SuppressWarnings("checkstyle:AvoidNestedBlocks")
    @Test
    public void testHourPartitionedTimestampFilters() {
        Table table = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");
        CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));
        Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch();
        Assert.assertEquals("Unfiltered table should created 9 read tasks", 9, unfiltered.planInputPartitions().length);
        {
            SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
            pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));
            Batch scan = builder.build().toBatch();
            InputPartition[] tasks = scan.planInputPartitions();
            Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.length);
            assertEqualsSafe(SCHEMA.asStruct(), expected(8, 9, 7, 6, 5), read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
        }
        {
            SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
            pushFilters(builder, And.apply(GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), LessThan.apply("ts", "2017-12-22T08:00:00+00:00")));
            Batch scan = builder.build().toBatch();
            InputPartition[] tasks = scan.planInputPartitions();
            Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.length);
            assertEqualsSafe(SCHEMA.asStruct(), expected(2, 1), read(table.location(), vectorized, "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)"));
        }
    }

    @SuppressWarnings("checkstyle:AvoidNestedBlocks")
    @Test
    public void testFilterByNonProjectedColumn() {
        {
            Schema actualProjection = SCHEMA.select("id", "data");
            List<Record> expected = Lists.newArrayList();
            for (Record rec : expected(5, 6, 7, 8, 9)) {
                expected.add(projectFlat(actualProjection, rec));
            }
            assertEqualsSafe(actualProjection.asStruct(), expected, read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", "id", "data"));
        }
        {
            // only project id: ts will be projected because of the filter, but data will not be included
            Schema actualProjection = SCHEMA.select("id");
            List<Record> expected = Lists.newArrayList();
            for (Record rec : expected(1, 2)) {
                expected.add(projectFlat(actualProjection, rec));
            }
            assertEqualsSafe(actualProjection.asStruct(), expected, read(unpartitioned.toString(), vectorized, "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", "id"));
        }
    }

    @Test
    public void testPartitionedByDataStartsWithFilter() {
        Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");
        CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));
        SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
        pushFilters(builder, new StringStartsWith("data", "junc"));
        Batch scan = builder.build().toBatch();
        Assert.assertEquals(1, scan.planInputPartitions().length);
    }

    @Test
    public void testPartitionedByIdStartsWith() {
        Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");
        CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));
        SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);
        pushFilters(builder, new StringStartsWith("data", "junc"));
        Batch scan = builder.build().toBatch();
        Assert.assertEquals(1, scan.planInputPartitions().length);
    }

    @Test
    public void testUnpartitionedStartsWith() {
        Dataset<Row> df = spark.read().format("iceberg").option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load(unpartitioned.toString());
        List<String> matchedData = df.select("data").where("data LIKE 'jun%'").as(Encoders.STRING()).collectAsList();
        Assert.assertEquals(1, matchedData.size());
        Assert.assertEquals("junction", matchedData.get(0));
    }

    private static Record projectFlat(Schema projection, Record record) {
        Record result = GenericRecord.create(projection);
        List<Types.NestedField> fields = projection.asStruct().fields();
        for (int i = 0; i < fields.size(); i += 1) {
            Types.NestedField field = fields.get(i);
            result.set(i, record.getField(field.name()));
        }
        return result;
    }

    public static void assertEqualsUnsafe(Types.StructType struct, List<Record> expected, List<UnsafeRow> actual) {
        // TODO: match records by ID
        int numRecords = Math.min(expected.size(), actual.size());
        for (int i = 0; i < numRecords; i += 1) {
            GenericsHelpers.assertEqualsUnsafe(struct, expected.get(i), actual.get(i));
        }
        Assert.assertEquals("Number of results should match expected", expected.size(), actual.size());
    }

    public static void assertEqualsSafe(Types.StructType struct, List<Record> expected, List<Row> actual) {
        // TODO: match records by ID
        int numRecords = Math.min(expected.size(), actual.size());
        for (int i = 0; i < numRecords; i += 1) {
            GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i));
        }
        Assert.assertEquals("Number of results should match expected", expected.size(), actual.size());
    }

    private List<Record> expected(int... ordinals) {
        List<Record> expected = Lists.newArrayListWithExpectedSize(ordinals.length);
        for (int ord : ordinals) {
            expected.add(records.get(ord));
        }
        return expected;
    }

    private void pushFilters(ScanBuilder scan, Filter... filters) {
        Assert.assertTrue(scan instanceof SupportsPushDownFilters);
        SupportsPushDownFilters filterable = (SupportsPushDownFilters) scan;
        filterable.pushFilters(filters);
    }

    private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
        File location = new File(parent, desc);
        Table table = TABLES.create(SCHEMA, spec, location.toString());
        // Do not combine or split files because the tests expect a split per partition.
        // A target split size of 2048 helps us achieve that.
        table.updateProperties().set("read.split.target-size", "2048").commit();
        // copy the unpartitioned table into the partitioned table to produce the partitioned data
        Dataset<Row> allRows = spark.read().format("iceberg").option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load(unpartitioned.toString());
        allRows.coalesce(// ensure only 1 file per partition is written
        1).withColumn("part", callUDF(udf, column(partitionColumn))).sortWithinPartitions("part").drop("part").write().format("iceberg").mode("append").save(table.location());
        table.refresh();
        return table;
    }

    private List<Record> testRecords(Schema schema) {
        return Lists.newArrayList(record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"), record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"), record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""), record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"), record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"), record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"), record(schema, 6L, parse("2017-12-21T21:55:30.589712+00:00"), "element"), record(schema, 7L, parse("2017-12-21T17:31:14.532797+00:00"), "limited"), record(schema, 8L, parse("2017-12-21T15:21:51.237521+00:00"), "global"), record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish"));
    }

    private static List<Row> read(String table, boolean vectorized, String expr) {
        return read(table, vectorized, expr, "*");
    }

    private static List<Row> read(String table, boolean vectorized, String expr, String select0, String... selectN) {
        Dataset<Row> dataset = spark.read().format("iceberg").option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load(table).filter(expr).select(select0, selectN);
        return dataset.collectAsList();
    }

    private static OffsetDateTime parse(String timestamp) {
        return OffsetDateTime.parse(timestamp);
    }

    private static Record record(Schema schema, Object... values) {
        Record rec = GenericRecord.create(schema);
        for (int i = 0; i < values.length; i += 1) {
            rec.set(i, values[i]);
        }
        return rec;
    }
}

16 Source : TestFilteredScan.java
with Apache License 2.0
from apache

@RunWith(Parameterized.class)
public class TestFilteredScan {

    private static final Configuration CONF = new Configuration();

    private static final HadoopTables TABLES = new HadoopTables(CONF);

    private static final Schema SCHEMA = new Schema(Types.NestedField.required(1, "id", Types.LongType.get()), Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), Types.NestedField.optional(3, "data", Types.StringType.get()));

    private static final PartitionSpec BUCKET_BY_ID = PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build();

    private static final PartitionSpec PARTITION_BY_DAY = PartitionSpec.builderFor(SCHEMA).day("ts").build();

    private static final PartitionSpec PARTITION_BY_HOUR = PartitionSpec.builderFor(SCHEMA).hour("ts").build();

    private static final PartitionSpec PARTITION_BY_DATA = PartitionSpec.builderFor(SCHEMA).identity("data").build();

    private static final PartitionSpec PARTITION_BY_ID = PartitionSpec.builderFor(SCHEMA).identity("id").build();

    private static SparkSession spark = null;

    @BeforeClass
    public static void startSpark() {
        TestFilteredScan.spark = SparkSession.builder().master("local[2]").getOrCreate();
        // define UDFs used by partition tests
        Transform<Long, Integer> bucket4 = Transforms.bucket(Types.LongType.get(), 4);
        spark.udf().register("bucket4", (UDF1<Long, Integer>) bucket4::apply, IntegerType$.MODULE$);
        Transform<Long, Integer> day = Transforms.day(Types.TimestampType.withZone());
        spark.udf().register("ts_day", (UDF1<Timestamp, Integer>) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), IntegerType$.MODULE$);
        Transform<Long, Integer> hour = Transforms.hour(Types.TimestampType.withZone());
        spark.udf().register("ts_hour", (UDF1<Timestamp, Integer>) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), IntegerType$.MODULE$);
        spark.udf().register("data_ident", (UDF1<String, String>) data -> data, StringType$.MODULE$);
        spark.udf().register("id_ident", (UDF1<Long, Long>) id -> id, LongType$.MODULE$);
    }

    @AfterClass
    public static void stopSpark() {
        SparkSession currentSpark = TestFilteredScan.spark;
        TestFilteredScan.spark = null;
        currentSpark.stop();
    }

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private final String format;

    private final boolean vectorized;

    @Parameterized.Parameters(name = "format = {0}, vectorized = {1}")
    public static Object[][] parameters() {
        return new Object[][] { { "parquet", false }, { "parquet", true }, { "avro", false }, { "orc", false }, { "orc", true } };
    }

    public TestFilteredScan(String format, boolean vectorized) {
        this.format = format;
        this.vectorized = vectorized;
    }

    private File parent = null;

    private File unpartitioned = null;

    private List<Record> records = null;

    @Before
    public void writeUnpartitionedTable() throws IOException {
        this.parent = temp.newFolder("TestFilteredScan");
        this.unpartitioned = new File(parent, "unpartitioned");
        File dataFolder = new File(unpartitioned, "data");
        Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs());
        Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString());
        // use the table schema because ids are reassigned
        Schema tableSchema = table.schema();
        FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));
        File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));
        // create records using the table's schema
        this.records = testRecords(tableSchema);
        try (FileAppender<Record> writer = new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) {
            writer.addAll(records);
        }
        DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()).withRecordCount(records.size()).withFileSizeInBytes(testFile.length()).withPath(testFile.toString()).build();
        table.newAppend().appendFile(file).commit();
    }

    @Test
    public void testUnpartitionedIDFilters() {
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", unpartitioned.toString()));
        IcebergSource source = new IcebergSource();
        for (int i = 0; i < 10; i += 1) {
            DataSourceReader reader = source.createReader(options);
            pushFilters(reader, EqualTo.apply("id", i));
            List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
            Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());
            // validate row filtering
            assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i));
        }
    }

    @Test
    public void testUnpartitionedCaseInsensitiveIDFilters() {
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", unpartitioned.toString()));
        // set spark.sql.caseSensitive to false
        String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
        TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");
        try {
            IcebergSource source = new IcebergSource();
            for (int i = 0; i < 10; i += 1) {
                DataSourceReader reader = source.createReader(options);
                // note lower(ID) == lower(id), so there must be a match
                pushFilters(reader, EqualTo.apply("ID", i));
                List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
                Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());
                // validate row filtering
                assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i));
            }
        } finally {
            // return global conf to previous state
            TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
        }
    }

    @Test
    public void testUnpartitionedTimestampFilter() {
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", unpartitioned.toString()));
        IcebergSource source = new IcebergSource();
        DataSourceReader reader = source.createReader(options);
        pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));
        List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
        Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());
        assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
    }

    @Test
    public void testBucketPartitionedIDFilters() {
        File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString()));
        IcebergSource source = new IcebergSource();
        DataSourceReader unfiltered = source.createReader(options);
        Assert.assertEquals("Unfiltered table should created 4 read tasks", 4, unfiltered.planInputPartitions().size());
        for (int i = 0; i < 10; i += 1) {
            DataSourceReader reader = source.createReader(options);
            pushFilters(reader, EqualTo.apply("id", i));
            List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
            // validate predicate push-down
            Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());
            // validate row filtering
            assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), vectorized, "id = " + i));
        }
    }

    @SuppressWarnings("checkstyle:AvoidNestedBlocks")
    @Test
    public void testDayPartitionedTimestampFilters() {
        File location = buildPartitionedTable("partitioned_by_day", PARTITION_BY_DAY, "ts_day", "ts");
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString()));
        IcebergSource source = new IcebergSource();
        DataSourceReader unfiltered = source.createReader(options);
        Assert.assertEquals("Unfiltered table should created 2 read tasks", 2, unfiltered.planInputPartitions().size());
        {
            DataSourceReader reader = source.createReader(options);
            pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));
            List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
            Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.size());
            assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), read(location.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
        }
        {
            DataSourceReader reader = source.createReader(options);
            pushFilters(reader, And.apply(GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), LessThan.apply("ts", "2017-12-22T08:00:00+00:00")));
            List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
            Assert.assertEquals("Should create one task for 2017-12-22", 1, tasks.size());
            assertEqualsSafe(SCHEMA.asStruct(), expected(1, 2), read(location.toString(), vectorized, "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)"));
        }
    }

    @SuppressWarnings("checkstyle:AvoidNestedBlocks")
    @Test
    public void testHourPartitionedTimestampFilters() {
        File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString()));
        IcebergSource source = new IcebergSource();
        DataSourceReader unfiltered = source.createReader(options);
        Assert.assertEquals("Unfiltered table should created 9 read tasks", 9, unfiltered.planInputPartitions().size());
        {
            DataSourceReader reader = source.createReader(options);
            pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));
            List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
            Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.size());
            assertEqualsSafe(SCHEMA.asStruct(), expected(8, 9, 7, 6, 5), read(location.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
        }
        {
            DataSourceReader reader = source.createReader(options);
            pushFilters(reader, And.apply(GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), LessThan.apply("ts", "2017-12-22T08:00:00+00:00")));
            List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
            Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.size());
            assertEqualsSafe(SCHEMA.asStruct(), expected(2, 1), read(location.toString(), vectorized, "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)"));
        }
    }

    @SuppressWarnings("checkstyle:AvoidNestedBlocks")
    @Test
    public void testFilterByNonProjectedColumn() {
        {
            Schema actualProjection = SCHEMA.select("id", "data");
            List<Record> expected = Lists.newArrayList();
            for (Record rec : expected(5, 6, 7, 8, 9)) {
                expected.add(projectFlat(actualProjection, rec));
            }
            assertEqualsSafe(actualProjection.asStruct(), expected, read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", "id", "data"));
        }
        {
            // only project id: ts will be projected because of the filter, but data will not be included
            Schema actualProjection = SCHEMA.select("id");
            List<Record> expected = Lists.newArrayList();
            for (Record rec : expected(1, 2)) {
                expected.add(projectFlat(actualProjection, rec));
            }
            assertEqualsSafe(actualProjection.asStruct(), expected, read(unpartitioned.toString(), vectorized, "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", "id"));
        }
    }

    @Test
    public void testInFilter() {
        File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString()));
        IcebergSource source = new IcebergSource();
        DataSourceReader reader = source.createReader(options);
        pushFilters(reader, new In("data", new String[] { "foo", "junction", "brush", null }));
        Assert.assertEquals(2, reader.planInputPartitions().size());
    }

    @Test
    public void testInFilterForTimestamp() {
        File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString()));
        IcebergSource source = new IcebergSource();
        DataSourceReader reader = source.createReader(options);
        pushFilters(reader, new In("ts", new Timestamp[] { new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000), new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000), new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000), new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000), null }));
        Assert.assertEquals("Should create 1 task for 2017-12-21: 15", 1, reader.planInputPartitions().size());
    }

    @Test
    public void testPartitionedByDataStartsWithFilter() {
        File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString()));
        IcebergSource source = new IcebergSource();
        DataSourceReader reader = source.createReader(options);
        pushFilters(reader, new StringStartsWith("data", "junc"));
        Assert.assertEquals(1, reader.planInputPartitions().size());
    }

    @Test
    public void testPartitionedByIdStartsWith() {
        File location = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");
        DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString()));
        IcebergSource source = new IcebergSource();
        DataSourceReader reader = source.createReader(options);
        pushFilters(reader, new StringStartsWith("data", "junc"));
        Assert.assertEquals(1, reader.planInputPartitions().size());
    }

    @Test
    public void testUnpartitionedStartsWith() {
        Dataset<Row> df = spark.read().format("iceberg").option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load(unpartitioned.toString());
        List<String> matchedData = df.select("data").where("data LIKE 'jun%'").as(Encoders.STRING()).collectAsList();
        Assert.assertEquals(1, matchedData.size());
        Assert.assertEquals("junction", matchedData.get(0));
    }

    private static Record projectFlat(Schema projection, Record record) {
        Record result = GenericRecord.create(projection);
        List<Types.NestedField> fields = projection.asStruct().fields();
        for (int i = 0; i < fields.size(); i += 1) {
            Types.NestedField field = fields.get(i);
            result.set(i, record.getField(field.name()));
        }
        return result;
    }

    public static void assertEqualsUnsafe(Types.StructType struct, List<Record> expected, List<UnsafeRow> actual) {
        // TODO: match records by ID
        int numRecords = Math.min(expected.size(), actual.size());
        for (int i = 0; i < numRecords; i += 1) {
            GenericsHelpers.assertEqualsUnsafe(struct, expected.get(i), actual.get(i));
        }
        Assert.assertEquals("Number of results should match expected", expected.size(), actual.size());
    }

    public static void assertEqualsSafe(Types.StructType struct, List<Record> expected, List<Row> actual) {
        // TODO: match records by ID
        int numRecords = Math.min(expected.size(), actual.size());
        for (int i = 0; i < numRecords; i += 1) {
            GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i));
        }
        Assert.assertEquals("Number of results should match expected", expected.size(), actual.size());
    }

    private List<Record> expected(int... ordinals) {
        List<Record> expected = Lists.newArrayListWithExpectedSize(ordinals.length);
        for (int ord : ordinals) {
            expected.add(records.get(ord));
        }
        return expected;
    }

    private void pushFilters(DataSourceReader reader, Filter... filters) {
        Assert.assertTrue(reader instanceof SupportsPushDownFilters);
        SupportsPushDownFilters filterable = (SupportsPushDownFilters) reader;
        filterable.pushFilters(filters);
    }

    private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
        File location = new File(parent, desc);
        Table byId = TABLES.create(SCHEMA, spec, location.toString());
        // Do not combine or split files because the tests expect a split per partition.
        // A target split size of 2048 helps us achieve that.
        byId.updateProperties().set("read.split.target-size", "2048").commit();
        // copy the unpartitioned table into the partitioned table to produce the partitioned data
        Dataset<Row> allRows = spark.read().format("iceberg").option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load(unpartitioned.toString());
        allRows.coalesce(// ensure only 1 file per partition is written
        1).withColumn("part", callUDF(udf, column(partitionColumn))).sortWithinPartitions("part").drop("part").write().format("iceberg").mode("append").save(byId.location());
        return location;
    }

    private List<Record> testRecords(Schema schema) {
        return Lists.newArrayList(record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"), record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"), record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""), record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"), record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"), record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"), record(schema, 6L, parse("2017-12-21T21:55:30.589712+00:00"), "element"), record(schema, 7L, parse("2017-12-21T17:31:14.532797+00:00"), "limited"), record(schema, 8L, parse("2017-12-21T15:21:51.237521+00:00"), "global"), record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish"));
    }

    private static List<Row> read(String table, boolean vectorized, String expr) {
        return read(table, vectorized, expr, "*");
    }

    private static List<Row> read(String table, boolean vectorized, String expr, String select0, String... selectN) {
        Dataset<Row> dataset = spark.read().format("iceberg").option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load(table).filter(expr).select(select0, selectN);
        return dataset.collectAsList();
    }

    private static OffsetDateTime parse(String timestamp) {
        return OffsetDateTime.parse(timestamp);
    }

    private static long instant(String timestamp) {
        return Literal.of(timestamp).<Long>to(Types.TimestampType.withZone()).value();
    }

    private static Record record(Schema schema, Object... values) {
        Record rec = GenericRecord.create(schema);
        for (int i = 0; i < values.length; i += 1) {
            rec.set(i, values[i]);
        }
        return rec;
    }
}

16 Source : SnapshotFunctionalityTest.java
with Apache License 2.0
from apache

@Before
public void before() throws IOException {
    Schema schema = new Schema(optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get()));
    spark = SparkSession.builder().master("local[2]").getOrCreate();
    tableLocation = Files.createTempDirectory("temp").toFile();
    HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
    PartitionSpec spec = PartitionSpec.unpartitioned();
    table = tables.create(schema, spec, tableLocation.toString());
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    for (int i = 0; i < 5; i++) {
        df.select("id", "data").write().format("iceberg").mode("append").save(tableLocation.toString());
    }
    table.refresh();
}

16 Source : TestSparkDataWrite.java
with Apache License 2.0
from apache

@Test
public void testWriteProjection() throws IOException {
    Assume.assumeTrue("Not supported in Spark 3.0; analysis requires all columns are present", spark.version().startsWith("2"));
    File parent = temp.newFolder(format.toString());
    File location = new File(parent, "test");
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Table table = tables.create(SCHEMA, spec, location.toString());
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null));
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    // select only id column
    df.select("id").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
    table.refresh();
    Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
    List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}

16 Source : TestSparkDataWrite.java
with Apache License 2.0
from apache

@Test
public void testViewsReturnRecentResults() throws IOException {
    File parent = temp.newFolder(format.toString());
    File location = new File(parent, "test");
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
    tables.create(SCHEMA, spec, location.toString());
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);
    df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
    Dataset<Row> query = spark.read().format("iceberg").load(location.toString()).where("id = 1");
    query.createOrReplaceTempView("tmp");
    List<SimpleRecord> actual1 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    List<SimpleRecord> expected1 = Lists.newArrayList(new SimpleRecord(1, "a"));
    Assert.assertEquals("Number of rows should match", expected1.size(), actual1.size());
    Assert.assertEquals("Result rows should match", expected1, actual1);
    df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
    List<SimpleRecord> actual2 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    List<SimpleRecord> expected2 = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "a"));
    Assert.assertEquals("Number of rows should match", expected2.size(), actual2.size());
    Assert.assertEquals("Result rows should match", expected2, actual2);
}

16 Source : TestSparkDataWrite.java
with Apache License 2.0
from apache

@Test
public void testWriteProjectionWithMiddle() throws IOException {
    Assume.assumeTrue("Not supported in Spark 3.0; analysis requires all columns are present", spark.version().startsWith("2"));
    File parent = temp.newFolder(format.toString());
    File location = new File(parent, "test");
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Schema schema = new Schema(optional(1, "c1", Types.IntegerType.get()), optional(2, "c2", Types.StringType.get()), optional(3, "c3", Types.StringType.get()));
    Table table = tables.create(schema, spec, location.toString());
    List<ThreeColumnRecord> expected = Lists.newArrayList(new ThreeColumnRecord(1, null, "hello"), new ThreeColumnRecord(2, null, "world"), new ThreeColumnRecord(3, null, null));
    Dataset<Row> df = spark.createDataFrame(expected, ThreeColumnRecord.class);
    df.select("c1", "c3").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
    table.refresh();
    Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
    List<ThreeColumnRecord> actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}

16 Source : TestDataSourceOptions.java
with Apache License 2.0
from apache

@Test
public void testHadoopOptions() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    Configuration sparkHadoopConf = spark.sessionState().newHadoopConf();
    String originalDefaultFS = sparkHadoopConf.get("fs.default.name");
    try {
        HadoopTables tables = new HadoopTables(CONF);
        PartitionSpec spec = PartitionSpec.unpartitioned();
        Map<String, String> options = Maps.newHashMap();
        tables.create(SCHEMA, spec, options, tableLocation);
        // set an invalid value for 'fs.default.name' in Spark Hadoop config
        // to verify that 'hadoop.' data source options are propagated correctly
        sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000");
        List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"));
        Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
        originalDf.select("id", "data").write().format("iceberg").mode("append").option("hadoop.fs.default.name", "file:///").save(tableLocation);
        Dataset<Row> resultDf = spark.read().format("iceberg").option("hadoop.fs.default.name", "file:///").load(tableLocation);
        List<SimpleRecord> resultRecords = resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
        Assert.assertEquals("Records should match", expectedRecords, resultRecords);
    } finally {
        sparkHadoopConf.set("fs.default.name", originalDefaultFS);
    }
}
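
The "hadoop."-prefixed options exercised above let a single Spark read or write override entries of the session's Hadoop Configuration. Outside Spark, the equivalent is to hand the desired Configuration directly to HadoopTables. A minimal sketch, assuming a table already exists at a hypothetical local path:

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;

public class LoadWithExplicitConf {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "file:///");          // force the local file system
        HadoopTables tables = new HadoopTables(conf);      // this conf backs the table's file access
        Table table = tables.load("/tmp/iceberg-table");   // hypothetical existing table location
        System.out.println(table.schema());
    }
}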

16 Source : TestDataSourceOptions.java
with Apache License 2.0
from apache

@Test
public void testNoWriteFormatOption() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");
    Table table = tables.create(SCHEMA, spec, options, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    df.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
        tasks.forEach(task -> {
            FileFormat fileFormat = FileFormat.fromFileName(task.file().path());
            Assert.assertEquals(FileFormat.AVRO, fileFormat);
        });
    }
}
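
With no write-format option, the file format falls back to the table property write.format.default (TableProperties.DEFAULT_FILE_FORMAT), which is why every data file above is Avro. A minimal sketch of the other direction, assuming the same spark session, SCHEMA, CONF and a DataFrame df as in this test plus a fresh tableLocation: the per-write option takes precedence over the table-level default.

Map<String, String> props = Maps.newHashMap();
props.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");      // table default: Avro
new HadoopTables(CONF).create(SCHEMA, PartitionSpec.unpartitioned(), props, tableLocation);

df.select("id", "data").write().format("iceberg")
    .option(SparkWriteOptions.WRITE_FORMAT, "parquet")       // per-write override: Parquet
    .mode("append")
    .save(tableLocation);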

16 Source : TestDataSourceOptions.java
with Apache License 2.0
from apache

@Test
public void testIncrementalScanOptions() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    Table table = tables.create(SCHEMA, spec, options, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"), new SimpleRecord(4, "d"));
    for (SimpleRecord record : expectedRecords) {
        Dataset<Row> originalDf = spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class);
        originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
    }
    List<Long> snapshotIds = SnapshotUtil.currentAncestors(table);
    // start-snapshot-id and snapshot-id are both configured.
    AssertHelpers.assertThrows("Check both start-snapshot-id and snapshot-id are configured", IllegalArgumentException.class, "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", () -> {
        spark.read().format("iceberg").option("snapshot-id", snapshotIds.get(3).toString()).option("start-snapshot-id", snapshotIds.get(3).toString()).load(tableLocation).explain();
    });
    // end-snapshot-id and as-of-timestamp are both configured.
    AssertHelpers.assertThrows("Check both start-snapshot-id and snapshot-id are configured", IllegalArgumentException.class, "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", () -> {
        spark.read().format("iceberg").option(SparkReadOptions.AS_OF_TIMESTAMP, Long.toString(table.snapshot(snapshotIds.get(3)).timestampMillis())).option("end-snapshot-id", snapshotIds.get(2).toString()).load(tableLocation).explain();
    });
    // only end-snapshot-id is configured.
    AssertHelpers.assertThrows("Check both start-snapshot-id and snapshot-id are configured", IllegalArgumentException.class, "Cannot only specify option end-snapshot-id to do incremental scan", () -> {
        spark.read().format("iceberg").option("end-snapshot-id", snapshotIds.get(2).toString()).load(tableLocation).explain();
    });
    // test (1st snapshot, current snapshot] incremental scan.
    List<SimpleRecord> result = spark.read().format("iceberg").option("start-snapshot-id", snapshotIds.get(3).toString()).load(tableLocation).orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Records should match", expectedRecords.subList(1, 4), result);
    // test (2nd snapshot, 3rd snapshot] incremental scan.
    List<SimpleRecord> result1 = spark.read().format("iceberg").option("start-snapshot-id", snapshotIds.get(2).toString()).option("end-snapshot-id", snapshotIds.get(1).toString()).load(tableLocation).orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Records should match", expectedRecords.subList(2, 3), result1);
}
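
The snapshot indexes in this test follow from the ordering of SnapshotUtil.currentAncestors, which walks from the current snapshot back through its parents: index 0 is the newest snapshot and the last index is the first commit, and the incremental range is (start-snapshot-id, end-snapshot-id], as the test's own comments note. A small sketch, assuming the same table, spark and tableLocation fixtures as this test:

List<Long> snapshotIds = SnapshotUtil.currentAncestors(table);
long newestSnapshotId = snapshotIds.get(0);                        // current snapshot
long firstSnapshotId = snapshotIds.get(snapshotIds.size() - 1);    // first commit
// (first snapshot, current snapshot]: everything appended after the first commit
Dataset<Row> incremental = spark.read().format("iceberg")
    .option("start-snapshot-id", Long.toString(firstSnapshotId))
    .load(tableLocation);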

16 Source : TestSplitScan.java
with Apache License 2.0
from apache

@RunWith(Parameterized.class)
public class TestSplitScan {

    private static final Configuration CONF = new Configuration();

    private static final HadoopTables TABLES = new HadoopTables(CONF);

    private static final long SPLIT_SIZE = 16 * 1024 * 1024;

    private static final Schema SCHEMA = new Schema(required(1, "id", Types.IntegerType.get()), required(2, "data", Types.StringType.get()));

    private Table table;

    private File tableLocation;

    @Rule
    public TemporaryFolder temp = new TemporaryFolder();

    private List<Record> expectedRecords;

    @Parameterized.Parameters(name = "format = {0}")
    public static Object[] parameters() {
        return new Object[] { "parquet", "avro" };
    }

    private final FileFormat format;

    public TestSplitScan(String format) {
        this.format = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));
    }

    @Before
    public void before() throws IOException {
        tableLocation = new File(temp.newFolder(), "table");
        setupTable();
    }

    @Test
    public void test() {
        replacedert.replacedertEquals("There should be 4 tasks created since file size is approximately close to 64MB and split size 16MB", 4, Lists.newArrayList(table.newScan().planTasks()).size());
        List<Record> records = Lists.newArrayList(IcebergGenerics.read(table).build());
        replacedert.replacedertEquals(expectedRecords.size(), records.size());
        for (int i = 0; i < expectedRecords.size(); i++) {
            replacedert.replacedertEquals(expectedRecords.get(i), records.get(i));
        }
    }

    private void setupTable() throws IOException {
        table = TABLES.create(SCHEMA, tableLocation.toString());
        table.updateProperties().set(TableProperties.SPLIT_SIZE, String.valueOf(SPLIT_SIZE)).commit();
        // With this number of records and the given SCHEMA,
        // we effectively write a data file of approximately 64 MB
        int numRecords = 2500000;
        expectedRecords = RandomGenericData.generate(SCHEMA, numRecords, 0L);
        File file = writeToFile(expectedRecords, format);
        DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned()).withRecordCount(expectedRecords.size()).withFileSizeInBytes(file.length()).withPath(file.toString()).withFormat(format).build();
        table.newAppend().appendFile(dataFile).commit();
    }

    private File writeToFile(List<Record> records, FileFormat fileFormat) throws IOException {
        File file = temp.newFile();
        Assert.assertTrue(file.delete());
        GenericAppenderFactory factory = new GenericAppenderFactory(SCHEMA).set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(SPLIT_SIZE));
        try (FileAppender<Record> appender = factory.newAppender(Files.localOutput(file), fileFormat)) {
            appender.addAll(records);
        }
        return file;
    }
}
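
The task-count assertion in this class is simple planning arithmetic over the split size. A rough sketch of that calculation (the actual data file size depends on the format, encoding and compression):

long fileSizeBytes = 64L * 1024 * 1024;                                      // ~64 MB data file written by setupTable()
long splitSizeBytes = SPLIT_SIZE;                                            // 16 MB, set via TableProperties.SPLIT_SIZE
long expectedTasks = (fileSizeBytes + splitSizeBytes - 1) / splitSizeBytes;  // ceil(64 / 16) = 4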

15 Source : TestJoinTablesWithHadoopTables.java
with Apache License 2.0
from ExpediaGroup

@Before
public void before() throws IOException {
    tableLocationA = temp.newFolder("table_a");
    tableLocationB = temp.newFolder("table_b");
    Schema schemaA = new Schema(optional(1, "first_name", Types.StringType.get()), optional(2, "salary", Types.LongType.get()), optional(3, "id", Types.LongType.get()));
    Schema schemaB = new Schema(optional(1, "name", Types.StringType.get()), optional(2, "salary", Types.LongType.get()));
    PartitionSpec spec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables();
    Table tableA = tables.create(schemaA, spec, tableLocationA.getAbsolutePath());
    Table tableB = tables.create(schemaB, spec, tableLocationB.getAbsolutePath());
    List<Record> tableAData = new ArrayList<>();
    tableAData.add(TestHelpers.createCustomRecord(schemaA, Arrays.asList("Ella", 3000L, 1L)));
    tableAData.add(TestHelpers.createCustomRecord(schemaA, Arrays.asList("Jean", 5000L, 2L)));
    tableAData.add(TestHelpers.createCustomRecord(schemaA, Arrays.asList("Joe", 2000L, 3L)));
    DataFile fileA = TestHelpers.writeFile(temp.newFile(), tableA, null, FileFormat.PARQUET, tableAData);
    List<Record> tableBData = new ArrayList<>();
    tableBData.add(TestHelpers.createCustomRecord(schemaB, Arrays.asList("Michael", 3000L)));
    tableBData.add(TestHelpers.createCustomRecord(schemaB, Arrays.asList("Andy", 3000L)));
    tableBData.add(TestHelpers.createCustomRecord(schemaB, Arrays.asList("Berta", 4000L)));
    DataFile fileB = TestHelpers.writeFile(temp.newFile(), tableB, null, FileFormat.PARQUET, tableBData);
    tableA.newAppend().appendFile(fileA).commit();
    tableB.newAppend().appendFile(fileB).commit();
    shell.start();
}

15 Source : TestSparkDataWrite.java
with Apache License 2.0
from apache

@Test
public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws IOException {
    File parent = temp.newFolder(format.toString());
    File location = new File(parent, "test");
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Table table = tables.create(SCHEMA, spec, location.toString());
    table.updateProperties().set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4").commit(); // ~4 bytes; low enough to trigger
    List<SimpleRecord> expected = Lists.newArrayListWithCapacity(4000);
    for (int i = 0; i < 4000; i++) {
        expected.add(new SimpleRecord(i, "a"));
    }
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
    table.refresh();
    Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
    List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
    List<DataFile> files = Lists.newArrayList();
    for (ManifestFile manifest : table.currentSnapshot().allManifests()) {
        for (DataFile file : ManifestFiles.read(manifest, table.io())) {
            files.add(file);
        }
    }
    // TODO: ORC does not yet support a target file size
    if (!format.equals(FileFormat.ORC)) {
        Assert.assertEquals("Should have 4 DataFiles", 4, files.size());
        Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000));
    }
}
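
Why the assertion expects exactly 4 files of 1000 rows each: the 4-byte target is exceeded immediately, and the writer appears to compare the current file length against write.target-file-size-bytes only periodically rather than per record. A sketch of that arithmetic, where the 1000-row check interval is an assumption about this Iceberg version rather than something the test asserts directly:

int totalRows = 4000;                                 // rows appended by the test
int lengthCheckInterval = 1000;                       // assumed length-check interval of the writer
long targetFileSize = 4L;                             // write.target-file-size-bytes, always exceeded
int expectedFiles = totalRows / lengthCheckInterval;  // every check rolls to a new file -> 4 files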
