Here are examples of the Java API org.apache.iceberg.FileFormat taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.
89 Examples
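Before the individual examples, here is a minimal sketch of how FileFormat is typically resolved and applied, using only calls that appear in the examples below (FileFormat.valueOf, name(), and addExtension()); the class name FileFormatSketch is illustrative and not taken from any of the listed projects.
import java.util.Locale;
import org.apache.iceberg.FileFormat;
public class FileFormatSketch {
    public static void main(String[] args) {
        // Resolve the enum constant from a lower-case string, as the parameterized tests below do.
        FileFormat format = FileFormat.valueOf("parquet".toUpperCase(Locale.ROOT));
        // name() is the value the tests pass to the 'write.format.default' table property.
        System.out.println(format.name());
        // addExtension() appends the format's file extension to a data file name.
        System.out.println(format.addExtension("data-0001"));
    }
}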
19
Source : IcebergQueryRunner.java
with Apache License 2.0
from trinodb
public static DistributedQueryRunner createIcebergQueryRunner(Map<String, String> extraProperties) throws Exception {
FileFormat defaultFormat = new IcebergConfig().getFileFormat();
return createIcebergQueryRunner(extraProperties, defaultFormat, TpchTable.getTables());
}
19
Source : IcebergQueryRunner.java
with Apache License 2.0
from trinodb
public static DistributedQueryRunner createIcebergQueryRunner(Map<String, String> extraProperties, FileFormat format) throws Exception {
return createIcebergQueryRunner(extraProperties, format, TpchTable.getTables());
}
19
Source : IcebergQueryRunner.java
with Apache License 2.0
from trinodb
public static DistributedQueryRunner createIcebergQueryRunner(Map<String, String> extraProperties, List<TpchTable<?>> tpchTables) throws Exception {
FileFormat defaultFormat = new IcebergConfig().getFileFormat();
return createIcebergQueryRunner(extraProperties, defaultFormat, tpchTables);
}
19
Source : TestFlinkTableSource.java
with Apache License 2.0
from apache
public class TestFlinkTableSource extends FlinkTestBase {
private static final String CATALOG_NAME = "test_catalog";
private static final String DATABASE_NAME = "test_db";
private static final String TABLE_NAME = "test_table";
private final FileFormat format = FileFormat.AVRO;
private static String warehouse;
private int scanEventCount = 0;
private ScanEvent lastScanEvent = null;
public TestFlinkTableSource() {
// register a scan event listener to validate pushdown
Listeners.register(event -> {
scanEventCount += 1;
lastScanEvent = event;
}, ScanEvent.class);
}
@Override
protected TableEnvironment getTableEnv() {
super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1);
return super.getTableEnv();
}
@BeforeClass
public static void createWarehouse() throws IOException {
File warehouseFile = TEMPORARY_FOLDER.newFolder();
replacedert.replacedertTrue("The warehouse should be deleted", warehouseFile.delete());
// before variables
warehouse = "file:" + warehouseFile;
}
@Before
public void before() {
sql("CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", CATALOG_NAME, warehouse);
sql("USE CATALOG %s", CATALOG_NAME);
sql("CREATE DATABASE %s", DATABASE_NAME);
sql("USE %s", DATABASE_NAME);
sql("CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", TABLE_NAME, format.name());
sql("INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", TABLE_NAME);
this.scanEventCount = 0;
this.lastScanEvent = null;
}
@After
public void clean() {
sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, TABLE_NAME);
sql("DROP DATABASE IF EXISTS %s", DATABASE_NAME);
sql("DROP CATALOG IF EXISTS %s", CATALOG_NAME);
}
@Test
public void testLimitPushDown() {
String querySql = String.format("SELECT * FROM %s LIMIT 1", TABLE_NAME);
String explain = getTableEnv().explainSql(querySql);
String expectedExplain = "limit=[1]";
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
replacedert.replacedertTrue("Explain should contain LimitPushDown", explain.contains(expectedExplain));
List<Object[]> result = sql(querySql);
replacedert.replacedertEquals("Should have 1 record", 1, result.size());
replacedert.replacedertArrayEquals("Should produce the expected records", expectRecord, result.get(0));
replacedertHelpers.replacedertThrows("Invalid limit number: -1 ", SqlParserException.clreplaced, () -> sql("SELECT * FROM %s LIMIT -1", TABLE_NAME));
replacedert.replacedertEquals("Should have 0 record", 0, sql("SELECT * FROM %s LIMIT 0", TABLE_NAME).size());
String sqlLimitExceed = String.format("SELECT * FROM %s LIMIT 4", TABLE_NAME);
List<Object[]> resultExceed = sql(sqlLimitExceed);
replacedert.replacedertEquals("Should have 3 records", 3, resultExceed.size());
List<Object[]> expectedList = Lists.newArrayList();
expectedList.add(new Object[] { 1, "iceberg", 10.0 });
expectedList.add(new Object[] { 2, "b", 20.0 });
expectedList.add(new Object[] { 3, null, 30.0 });
replacedert.replacedertArrayEquals("Should produce the expected records", expectedList.toArray(), resultExceed.toArray());
String sqlMixed = String.format("SELECT * FROM %s WHERE id = 1 LIMIT 2", TABLE_NAME);
List<Object[]> mixedResult = sql(sqlMixed);
replacedert.replacedertEquals("Should have 1 record", 1, mixedResult.size());
replacedert.replacedertArrayEquals("Should produce the expected records", expectRecord, mixedResult.get(0));
}
@Test
public void testNoFilterPushDown() {
String sql = String.format("SELECT * FROM %s ", TABLE_NAME);
List<Object[]> result = sql(sql);
List<Object[]> expectedRecords = Lists.newArrayList();
expectedRecords.add(new Object[] { 1, "iceberg", 10.0 });
expectedRecords.add(new Object[] { 2, "b", 20.0 });
expectedRecords.add(new Object[] { 3, null, 30.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedRecords.toArray(), result.toArray());
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
}
@Test
public void testFilterPushDownEqual() {
String sqlLiteralRight = String.format("SELECT * FROM %s WHERE id = 1 ", TABLE_NAME);
String expectedFilter = "ref(name=\"id\") == 1";
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
List<Object[]> result = sql(sqlLiteralRight);
replacedert.replacedertEquals("Should have 1 record", 1, result.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, result.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownEqualNull() {
String sqlEqualNull = String.format("SELECT * FROM %s WHERE data = NULL ", TABLE_NAME);
List<Object[]> result = sql(sqlEqualNull);
replacedert.replacedertEquals("Should have 0 record", 0, result.size());
replacedert.replacedertNull("Should not push down a filter", lastScanEvent);
}
@Test
public void testFilterPushDownEqualLiteralOnLeft() {
String sqlLiteralLeft = String.format("SELECT * FROM %s WHERE 1 = id ", TABLE_NAME);
String expectedFilter = "ref(name=\"id\") == 1";
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
List<Object[]> resultLeft = sql(sqlLiteralLeft);
replacedert.replacedertEquals("Should have 1 record", 1, resultLeft.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultLeft.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownNoEqual() {
String sqlNE = String.format("SELECT * FROM %s WHERE id <> 1 ", TABLE_NAME);
String expectedFilter = "ref(name=\"id\") != 1";
List<Object[]> resultNE = sql(sqlNE);
replacedert.replacedertEquals("Should have 2 records", 2, resultNE.size());
List<Object[]> expectedNE = Lists.newArrayList();
expectedNE.add(new Object[] { 2, "b", 20.0 });
expectedNE.add(new Object[] { 3, null, 30.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedNE.toArray(), resultNE.toArray());
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownNoEqualNull() {
String sqlNotEqualNull = String.format("SELECT * FROM %s WHERE data <> NULL ", TABLE_NAME);
List<Object[]> resultNE = sql(sqlNotEqualNull);
replacedert.replacedertEquals("Should have 0 records", 0, resultNE.size());
replacedert.replacedertNull("Should not push down a filter", lastScanEvent);
}
@Test
public void testFilterPushDownAnd() {
String sqlAnd = String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME);
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
List<Object[]> resultAnd = sql(sqlAnd);
replacedert.replacedertEquals("Should have 1 record", 1, resultAnd.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultAnd.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
String expected = "(ref(name=\"id\") == 1 and ref(name=\"data\") == \"iceberg\")";
replacedert.replacedertEquals("Should contain the push down filter", expected, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownOr() {
String sqlOr = String.format("SELECT * FROM %s WHERE id = 1 OR data = 'b' ", TABLE_NAME);
String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"data\") == \"b\")";
List<Object[]> resultOr = sql(sqlOr);
replacedert.replacedertEquals("Should have 2 record", 2, resultOr.size());
List<Object[]> expectedOR = Lists.newArrayList();
expectedOR.add(new Object[] { 1, "iceberg", 10.0 });
expectedOR.add(new Object[] { 2, "b", 20.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedOR.toArray(), resultOr.toArray());
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownGreaterThan() {
String sqlGT = String.format("SELECT * FROM %s WHERE id > 1 ", TABLE_NAME);
String expectedFilter = "ref(name=\"id\") > 1";
List<Object[]> resultGT = sql(sqlGT);
replacedert.replacedertEquals("Should have 2 record", 2, resultGT.size());
List<Object[]> expectedGT = Lists.newArrayList();
expectedGT.add(new Object[] { 2, "b", 20.0 });
expectedGT.add(new Object[] { 3, null, 30.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedGT.toArray(), resultGT.toArray());
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownGreaterThanNull() {
String sqlGT = String.format("SELECT * FROM %s WHERE data > null ", TABLE_NAME);
List<Object[]> resultGT = sql(sqlGT);
replacedert.replacedertEquals("Should have 0 record", 0, resultGT.size());
replacedert.replacedertNull("Should not push down a filter", lastScanEvent);
}
@Test
public void testFilterPushDownGreaterThanLiteralOnLeft() {
String sqlGT = String.format("SELECT * FROM %s WHERE 3 > id ", TABLE_NAME);
String expectedFilter = "ref(name=\"id\") < 3";
List<Object[]> resultGT = sql(sqlGT);
replacedert.replacedertEquals("Should have 2 records", 2, resultGT.size());
List<Object[]> expectedGT = Lists.newArrayList();
expectedGT.add(new Object[] { 1, "iceberg", 10.0 });
expectedGT.add(new Object[] { 2, "b", 20.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedGT.toArray(), resultGT.toArray());
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownGreaterThanEqual() {
String sqlGTE = String.format("SELECT * FROM %s WHERE id >= 2 ", TABLE_NAME);
String expectedFilter = "ref(name=\"id\") >= 2";
List<Object[]> resultGTE = sql(sqlGTE);
replacedert.replacedertEquals("Should have 2 records", 2, resultGTE.size());
List<Object[]> expectedGTE = Lists.newArrayList();
expectedGTE.add(new Object[] { 2, "b", 20.0 });
expectedGTE.add(new Object[] { 3, null, 30.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedGTE.toArray(), resultGTE.toArray());
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownGreaterThanEqualNull() {
String sqlGTE = String.format("SELECT * FROM %s WHERE data >= null ", TABLE_NAME);
List<Object[]> resultGT = sql(sqlGTE);
replacedert.replacedertEquals("Should have 0 record", 0, resultGT.size());
replacedert.replacedertNull("Should not push down a filter", lastScanEvent);
}
@Test
public void testFilterPushDownGreaterThanEqualLiteralOnLeft() {
String sqlGTE = String.format("SELECT * FROM %s WHERE 2 >= id ", TABLE_NAME);
String expectedFilter = "ref(name=\"id\") <= 2";
List<Object[]> resultGTE = sql(sqlGTE);
replacedert.replacedertEquals("Should have 2 records", 2, resultGTE.size());
List<Object[]> expectedGTE = Lists.newArrayList();
expectedGTE.add(new Object[] { 1, "iceberg", 10.0 });
expectedGTE.add(new Object[] { 2, "b", 20.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedGTE.toArray(), resultGTE.toArray());
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownLessThan() {
String sqlLT = String.format("SELECT * FROM %s WHERE id < 2 ", TABLE_NAME);
String expectedFilter = "ref(name=\"id\") < 2";
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
List<Object[]> resultLT = sql(sqlLT);
replacedert.replacedertEquals("Should have 1 record", 1, resultLT.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultLT.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownLessThanNull() {
String sqlLT = String.format("SELECT * FROM %s WHERE data < null ", TABLE_NAME);
List<Object[]> resultGT = sql(sqlLT);
replacedert.replacedertEquals("Should have 0 record", 0, resultGT.size());
replacedert.replacedertNull("Should not push down a filter", lastScanEvent);
}
@Test
public void testFilterPushDownLessThanLiteralOnLeft() {
String sqlLT = String.format("SELECT * FROM %s WHERE 2 < id ", TABLE_NAME);
Object[] expectRecord = new Object[] { 3, null, 30.0 };
String expectedFilter = "ref(name=\"id\") > 2";
List<Object[]> resultLT = sql(sqlLT);
replacedert.replacedertEquals("Should have 1 record", 1, resultLT.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultLT.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownLessThanEqual() {
String sqlLTE = String.format("SELECT * FROM %s WHERE id <= 1 ", TABLE_NAME);
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
String expectedFilter = "ref(name=\"id\") <= 1";
List<Object[]> resultLTE = sql(sqlLTE);
replacedert.replacedertEquals("Should have 1 record", 1, resultLTE.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultLTE.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownLessThanEqualNull() {
String sqlLTE = String.format("SELECT * FROM %s WHERE data <= null ", TABLE_NAME);
List<Object[]> resultGT = sql(sqlLTE);
replacedert.replacedertEquals("Should have 0 record", 0, resultGT.size());
replacedert.replacedertNull("Should not push down a filter", lastScanEvent);
}
@Test
public void testFilterPushDownLessThanEqualLiteralOnLeft() {
String sqlLTE = String.format("SELECT * FROM %s WHERE 3 <= id ", TABLE_NAME);
Object[] expectRecord = new Object[] { 3, null, 30.0 };
String expectedFilter = "ref(name=\"id\") >= 3";
List<Object[]> resultLTE = sql(sqlLTE);
replacedert.replacedertEquals("Should have 1 record", 1, resultLTE.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultLTE.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownIn() {
String sqlIN = String.format("SELECT * FROM %s WHERE id IN (1,2) ", TABLE_NAME);
String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"id\") == 2)";
List<Object[]> resultIN = sql(sqlIN);
replacedert.replacedertEquals("Should have 2 records", 2, resultIN.size());
List<Object[]> expectedIN = Lists.newArrayList();
expectedIN.add(new Object[] { 1, "iceberg", 10.0 });
expectedIN.add(new Object[] { 2, "b", 20.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedIN.toArray(), resultIN.toArray());
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownInNull() {
String sqlInNull = String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME);
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
List<Object[]> result = sql(sqlInNull);
replacedert.replacedertEquals("Should have 1 record", 1, result.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, result.get(0));
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
}
@Test
public void testFilterPushDownNotIn() {
String sqlNotIn = String.format("SELECT * FROM %s WHERE id NOT IN (3,2) ", TABLE_NAME);
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
List<Object[]> resultNotIn = sql(sqlNotIn);
replacedert.replacedertEquals("Should have 1 record", 1, resultNotIn.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultNotIn.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
String expectedScan = "(ref(name=\"id\") != 2 and ref(name=\"id\") != 3)";
replacedert.replacedertEquals("Should contain the push down filter", expectedScan, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownNotInNull() {
String sqlNotInNull = String.format("SELECT * FROM %s WHERE id NOT IN (1,2,NULL) ", TABLE_NAME);
List<Object[]> resultGT = sql(sqlNotInNull);
replacedert.replacedertEquals("Should have 0 record", 0, resultGT.size());
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
}
@Test
public void testFilterPushDownIsNotNull() {
String sqlNotNull = String.format("SELECT * FROM %s WHERE data IS NOT NULL", TABLE_NAME);
String expectedFilter = "not_null(ref(name=\"data\"))";
List<Object[]> resultNotNull = sql(sqlNotNull);
replacedert.replacedertEquals("Should have 2 record", 2, resultNotNull.size());
List<Object[]> expected = Lists.newArrayList();
expected.add(new Object[] { 1, "iceberg", 10.0 });
expected.add(new Object[] { 2, "b", 20.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expected.toArray(), resultNotNull.toArray());
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownIsNull() {
String sqlNull = String.format("SELECT * FROM %s WHERE data IS NULL", TABLE_NAME);
Object[] expectRecord = new Object[] { 3, null, 30.0 };
String expectedFilter = "is_null(ref(name=\"data\"))";
List<Object[]> resultNull = sql(sqlNull);
replacedert.replacedertEquals("Should have 1 record", 1, resultNull.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultNull.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownNot() {
String sqlNot = String.format("SELECT * FROM %s WHERE NOT (id = 1 OR id = 2 ) ", TABLE_NAME);
Object[] expectRecord = new Object[] { 3, null, 30.0 };
List<Object[]> resultNot = sql(sqlNot);
replacedert.replacedertEquals("Should have 1 record", 1, resultNot.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultNot.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
String expectedFilter = "(ref(name=\"id\") != 1 and ref(name=\"id\") != 2)";
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownBetween() {
String sqlBetween = String.format("SELECT * FROM %s WHERE id BETWEEN 1 AND 2 ", TABLE_NAME);
List<Object[]> resultBetween = sql(sqlBetween);
replacedert.replacedertEquals("Should have 2 record", 2, resultBetween.size());
List<Object[]> expectedBetween = Lists.newArrayList();
expectedBetween.add(new Object[] { 1, "iceberg", 10.0 });
expectedBetween.add(new Object[] { 2, "b", 20.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedBetween.toArray(), resultBetween.toArray());
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
String expected = "(ref(name=\"id\") >= 1 and ref(name=\"id\") <= 2)";
replacedert.replacedertEquals("Should contain the push down filter", expected, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownNotBetween() {
String sqlNotBetween = String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME);
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
String expectedFilter = "(ref(name=\"id\") < 2 or ref(name=\"id\") > 3)";
List<Object[]> resultNotBetween = sql(sqlNotBetween);
replacedert.replacedertEquals("Should have 1 record", 1, resultNotBetween.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultNotBetween.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterPushDownLike() {
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
String expectedFilter = "ref(name=\"data\") startsWith \"\"ice\"\"";
String sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'ice%%' ";
List<Object[]> resultLike = sql(sqlLike);
replacedert.replacedertEquals("Should have 1 record", 1, resultLike.size());
replacedert.replacedertArrayEquals("The like result should produce the expected record", expectRecord, resultLike.get(0));
replacedert.replacedertEquals("Should create only one scan", 1, scanEventCount);
replacedert.replacedertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString());
}
@Test
public void testFilterNotPushDownLike() {
Object[] expectRecord = new Object[] { 1, "iceberg", 10.0 };
String sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i' ";
List<Object[]> resultLike = sql(sqlNoPushDown);
replacedert.replacedertEquals("Should have 1 record", 0, resultLike.size());
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i%%' ";
resultLike = sql(sqlNoPushDown);
replacedert.replacedertEquals("Should have 1 record", 1, resultLike.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultLike.get(0));
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%ice%%g' ";
resultLike = sql(sqlNoPushDown);
replacedert.replacedertEquals("Should have 1 record", 1, resultLike.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultLike.get(0));
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%' ";
resultLike = sql(sqlNoPushDown);
replacedert.replacedertEquals("Should have 3 records", 3, resultLike.size());
List<Object[]> expectedRecords = Lists.newArrayList();
expectedRecords.add(new Object[] { 1, "iceberg", 10.0 });
expectedRecords.add(new Object[] { 2, "b", 20.0 });
expectedRecords.add(new Object[] { 3, null, 30.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedRecords.toArray(), resultLike.toArray());
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'iceber_' ";
resultLike = sql(sqlNoPushDown);
replacedert.replacedertEquals("Should have 1 record", 1, resultLike.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultLike.get(0));
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'i%%g' ";
resultLike = sql(sqlNoPushDown);
replacedert.replacedertEquals("Should have 1 record", 1, resultLike.size());
replacedert.replacedertArrayEquals("Should produce the expected record", expectRecord, resultLike.get(0));
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
}
@Test
public void testFilterPushDown2Literal() {
String sql2Literal = String.format("SELECT * FROM %s WHERE 1 > 0 ", TABLE_NAME);
List<Object[]> result = sql(sql2Literal);
List<Object[]> expectedRecords = Lists.newArrayList();
expectedRecords.add(new Object[] { 1, "iceberg", 10.0 });
expectedRecords.add(new Object[] { 2, "b", 20.0 });
expectedRecords.add(new Object[] { 3, null, 30.0 });
replacedert.replacedertArrayEquals("Should produce the expected record", expectedRecords.toArray(), result.toArray());
replacedert.replacedertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter());
}
/**
* NaN is not supported by Flink now, so this test case asserts the parse error. When we upgrade to a Flink version
* that supports NaN, we will delete this method and add test cases covering NaN.
*/
@Test
public void testSqlParseError() {
String sqlParseErrorEqual = String.format("SELECT * FROM %s WHERE d = CAST('NaN' AS DOUBLE) ", TABLE_NAME);
replacedertHelpers.replacedertThrows("The NaN is not supported by flink now. ", NumberFormatException.clreplaced, () -> sql(sqlParseErrorEqual));
String sqlParseErrorNotEqual = String.format("SELECT * FROM %s WHERE d <> CAST('NaN' AS DOUBLE) ", TABLE_NAME);
replacedertHelpers.replacedertThrows("The NaN is not supported by flink now. ", NumberFormatException.clreplaced, () -> sql(sqlParseErrorNotEqual));
String sqlParseErrorGT = String.format("SELECT * FROM %s WHERE d > CAST('NaN' AS DOUBLE) ", TABLE_NAME);
replacedertHelpers.replacedertThrows("The NaN is not supported by flink now. ", NumberFormatException.clreplaced, () -> sql(sqlParseErrorGT));
String sqlParseErrorLT = String.format("SELECT * FROM %s WHERE d < CAST('NaN' AS DOUBLE) ", TABLE_NAME);
replacedertHelpers.replacedertThrows("The NaN is not supported by flink now. ", NumberFormatException.clreplaced, () -> sql(sqlParseErrorLT));
String sqlParseErrorGTE = String.format("SELECT * FROM %s WHERE d >= CAST('NaN' AS DOUBLE) ", TABLE_NAME);
replacedertHelpers.replacedertThrows("The NaN is not supported by flink now. ", NumberFormatException.clreplaced, () -> sql(sqlParseErrorGTE));
String sqlParseErrorLTE = String.format("SELECT * FROM %s WHERE d <= CAST('NaN' AS DOUBLE) ", TABLE_NAME);
replacedertHelpers.replacedertThrows("The NaN is not supported by flink now. ", NumberFormatException.clreplaced, () -> sql(sqlParseErrorLTE));
}
}
19
Source : TestFlinkCatalogTablePartitions.java
with Apache License 2.0
from apache
public class TestFlinkCatalogTablePartitions extends FlinkCatalogTestBase {
private String tableName = "test_table";
private final FileFormat format;
@Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}")
public static Iterable<Object[]> parameters() {
List<Object[]> parameters = Lists.newArrayList();
for (FileFormat format : new FileFormat[] { FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET }) {
for (Boolean cacheEnabled : new Boolean[] { true, false }) {
for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) {
String catalogName = (String) catalogParams[0];
Namespace baseNamespace = (Namespace) catalogParams[1];
parameters.add(new Object[] { catalogName, baseNamespace, format, cacheEnabled });
}
}
}
return parameters;
}
public TestFlinkCatalogTablePartitions(String catalogName, Namespace baseNamespace, FileFormat format, boolean cacheEnabled) {
super(catalogName, baseNamespace);
this.format = format;
config.put(CACHE_ENABLED, String.valueOf(cacheEnabled));
}
@Before
public void before() {
super.before();
sql("CREATE DATABASE %s", flinkDatabase);
sql("USE CATALOG %s", catalogName);
sql("USE %s", DATABASE);
}
@After
public void cleanNamespaces() {
sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName);
sql("DROP DATABASE IF EXISTS %s", flinkDatabase);
super.clean();
}
@Test
public void testListPartitionsWithUnpartitionedTable() {
sql("CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", tableName, format.name());
sql("INSERT INTO %s SELECT 1,'a'", tableName);
ObjectPath objectPath = new ObjectPath(DATABASE, tableName);
FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get();
replacedertHelpers.replacedertThrows("Should not list parreplacedions for unparreplacedioned table.", TableNotParreplacedionedException.clreplaced, () -> flinkCatalog.listParreplacedions(objectPath));
}
@Test
public void testListPartitionsWithPartitionedTable() throws TableNotExistException, TableNotPartitionedException {
sql("CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " + "with ('write.format.default'='%s')", tableName, format.name());
sql("INSERT INTO %s SELECT 1,'a'", tableName);
sql("INSERT INTO %s SELECT 2,'b'", tableName);
ObjectPath objectPath = new ObjectPath(DATABASE, tableName);
FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get();
List<CatalogParreplacedionSpec> list = flinkCatalog.listParreplacedions(objectPath);
replacedert.replacedertEquals("Should have 2 parreplacedion", 2, list.size());
List<CatalogParreplacedionSpec> expected = Lists.newArrayList();
CatalogParreplacedionSpec parreplacedionSpec1 = new CatalogParreplacedionSpec(ImmutableMap.of("data", "a"));
CatalogParreplacedionSpec parreplacedionSpec2 = new CatalogParreplacedionSpec(ImmutableMap.of("data", "b"));
expected.add(parreplacedionSpec1);
expected.add(parreplacedionSpec2);
replacedert.replacedertEquals("Should produce the expected catalog parreplacedion specs.", list, expected);
}
}
19
Source : TestFlinkCatalogTablePartitions.java
with Apache License 2.0
from apache
@Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}")
public static Iterable<Object[]> parameters() {
List<Object[]> parameters = Lists.newArrayList();
for (FileFormat format : new FileFormat[] { FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET }) {
for (Boolean cacheEnabled : new Boolean[] { true, false }) {
for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) {
String catalogName = (String) catalogParams[0];
Namespace baseNamespace = (Namespace) catalogParams[1];
parameters.add(new Object[] { catalogName, baseNamespace, format, cacheEnabled });
}
}
}
return parameters;
}
18
Source : IcebergWritableTableHandle.java
with Apache License 2.0
from trinodb
public class IcebergWritableTableHandle implements ConnectorInsertTableHandle, ConnectorOutputTableHandle {
private final String schemaName;
private final String tableName;
private final String schemaAsJson;
private final String partitionSpecAsJson;
private final List<IcebergColumnHandle> inputColumns;
private final String outputPath;
private final FileFormat fileFormat;
@JsonCreator
public IcebergWritableTableHandle(@JsonProperty("schemaName") String schemaName, @JsonProperty("tableName") String tableName, @JsonProperty("schemaAsJson") String schemaAsJson, @JsonProperty("partitionSpecAsJson") String partitionSpecAsJson, @JsonProperty("inputColumns") List<IcebergColumnHandle> inputColumns, @JsonProperty("outputPath") String outputPath, @JsonProperty("fileFormat") FileFormat fileFormat) {
this.schemaName = requireNonNull(schemaName, "schemaName is null");
this.tableName = requireNonNull(tableName, "tableName is null");
this.schemaAsJson = requireNonNull(schemaAsJson, "schemaAsJson is null");
this.partitionSpecAsJson = requireNonNull(partitionSpecAsJson, "partitionSpecAsJson is null");
this.inputColumns = ImmutableList.copyOf(requireNonNull(inputColumns, "inputColumns is null"));
this.outputPath = requireNonNull(outputPath, "filePrefix is null");
this.fileFormat = requireNonNull(fileFormat, "fileFormat is null");
}
@JsonProperty
public String getSchemaName() {
return schemaName;
}
@JsonProperty
public String getTableName() {
return tableName;
}
@JsonProperty
public String getSchemaAsJson() {
return schemaAsJson;
}
@JsonProperty
public String getPartitionSpecAsJson() {
return partitionSpecAsJson;
}
@JsonProperty
public List<IcebergColumnHandle> getInputColumns() {
return inputColumns;
}
@JsonProperty
public String getOutputPath() {
return outputPath;
}
@JsonProperty
public FileFormat getFileFormat() {
return fileFormat;
}
@Override
public String toString() {
return schemaName + "." + tableName;
}
}
18
Source : TestSparkReadProjection.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public abstract class TestSparkReadProjection extends TestReadProjection {
private static SparkSession spark = null;
@Parameterized.Parameters(name = "format = {0}, vectorized = {1}")
public static Object[][] parameters() {
return new Object[][] { { "parquet", false }, { "parquet", true }, { "avro", false }, { "orc", false }, { "orc", true } };
}
private final FileFormat format;
private final boolean vectorized;
public TestSparkReadProjection(String format, boolean vectorized) {
super(format);
this.format = FileFormat.valueOf(format.toUpperCase(Locale.ROOT));
this.vectorized = vectorized;
}
@BeforeClass
public static void startSpark() {
TestSparkReadProjection.spark = SparkSession.builder().master("local[2]").getOrCreate();
ImmutableMap<String, String> config = ImmutableMap.of("type", "hive", "default-namespace", "default", "parquet-enabled", "true", "cache-enabled", "false");
spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog");
config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value));
}
@AfterClass
public static void stopSpark() {
SparkSession currentSpark = TestSparkReadProjection.spark;
TestSparkReadProjection.spark = null;
currentSpark.stop();
}
@Override
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException {
File parent = temp.newFolder(desc);
File location = new File(parent, "test");
File dataFolder = new File(location, "data");
replacedert.replacedertTrue("mkdirs should succeed", dataFolder.mkdirs());
File testFile = new File(dataFolder, format.addExtension(UUID.randomUUID().toString()));
Table table = TestTables.create(location, desc, writeSchema, ParreplacedionSpec.unparreplacedioned());
try {
// Important: use the table's schema for the rest of the test
// When tables are created, the column ids are reassigned.
Schema tableSchema = table.schema();
try (FileAppender<Record> writer = new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), format)) {
writer.add(record);
}
DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()).withRecordCount(100).withFileSizeInBytes(testFile.length()).withPath(testFile.toString()).build();
table.newAppend().appendFile(file).commit();
// rewrite the read schema for the table's reassigned ids
Map<Integer, Integer> idMapping = Maps.newHashMap();
for (int id : allIds(writeSchema)) {
// translate each id to the original schema's column name, then to the new schema's id
String originalName = writeSchema.findColumnName(id);
idMapping.put(id, tableSchema.findField(originalName).fieldId());
}
Schema expectedSchema = reassignIds(readSchema, idMapping);
// Set the schema to the expected schema directly to simulate the table schema evolving
TestTables.replaceMetadata(desc, TestTables.readMetadata(desc).updateSchema(expectedSchema, 100));
Dataset<Row> df = spark.read().format("org.apache.iceberg.spark.source.TestIcebergSource").option("iceberg.table.name", desc).option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)).load();
return SparkValueConverter.convert(readSchema, df.collectAsList().get(0));
} finally {
TestTables.clearTables();
}
}
private List<Integer> allIds(Schema schema) {
List<Integer> ids = Lists.newArrayList();
TypeUtil.visit(schema, new TypeUtil.SchemaVisitor<Void>() {
@Override
public Void field(Types.NestedField field, Void fieldResult) {
ids.add(field.fieldId());
return null;
}
@Override
public Void list(Types.ListType list, Void elementResult) {
ids.add(list.elementId());
return null;
}
@Override
public Void map(Types.MapType map, Void keyResult, Void valueResult) {
ids.add(map.keyId());
ids.add(map.valueId());
return null;
}
});
return ids;
}
private Schema reassignIds(Schema schema, Map<Integer, Integer> idMapping) {
return new Schema(TypeUtil.visit(schema, new TypeUtil.SchemaVisitor<Type>() {
private int mapId(int id) {
if (idMapping.containsKey(id)) {
return idMapping.get(id);
}
// make sure the new IDs don't conflict with reassignment
return 1000 + id;
}
@Override
public Type schema(Schema schema, Type structResult) {
return structResult;
}
@Override
public Type struct(Types.StructType struct, List<Type> fieldResults) {
List<Types.NestedField> newFields = Lists.newArrayListWithExpectedSize(fieldResults.size());
List<Types.NestedField> fields = struct.fields();
for (int i = 0; i < fields.size(); i += 1) {
Types.NestedField field = fields.get(i);
if (field.isOptional()) {
newFields.add(optional(mapId(field.fieldId()), field.name(), fieldResults.get(i)));
} else {
newFields.add(required(mapId(field.fieldId()), field.name(), fieldResults.get(i)));
}
}
return Types.StructType.of(newFields);
}
@Override
public Type field(Types.NestedField field, Type fieldResult) {
return fieldResult;
}
@Override
public Type list(Types.ListType list, Type elementResult) {
if (list.isElementOptional()) {
return Types.ListType.ofOptional(mapId(list.elementId()), elementResult);
} else {
return Types.ListType.ofRequired(mapId(list.elementId()), elementResult);
}
}
@Override
public Type map(Types.MapType map, Type keyResult, Type valueResult) {
if (map.isValueOptional()) {
return Types.MapType.ofOptional(mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult);
} else {
return Types.MapType.ofRequired(mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult);
}
}
@Override
public Type primitive(Type.PrimitiveType primitive) {
return primitive;
}
}).asNestedType().asStructType().fields());
}
}
18
Source : SparkAppenderFactory.java
with Apache License 2.0
from apache
@Override
public DataWriter<InternalRow> newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike parreplacedion) {
return new DataWriter<>(newAppender(file.encryptingOutputFile(), format), format, file.encryptingOutputFile().location(), spec, parreplacedion, file.keyMetadata());
}
18
Source : TestHiveIcebergStorageHandlerWithEngine.java
with Apache License 2.0
from apache
@Parameters(name = "fileFormat={0}, engine={1}, catalog={2}")
public static Collection<Object[]> parameters() {
Collection<Object[]> testParams = new ArrayList<>();
String javaVersion = System.getProperty("java.specification.version");
// Run tests with every FileFormat for a single Catalog (HiveCatalog)
for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) {
for (String engine : EXECUTION_ENGINES) {
// include Tez tests only for Java 8
if (javaVersion.equals("1.8") || "mr".equals(engine)) {
testParams.add(new Object[] { fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG });
}
}
}
// Run tests for every Catalog for a single FileFormat (PARQUET) and execution engine (mr)
// skip HiveCatalog tests as they are added before
for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) {
if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) {
testParams.add(new Object[] { FileFormat.PARQUET, "mr", testTableType });
}
}
return testParams;
}
18
Source : TestHiveIcebergStorageHandlerLocalScan.java
with Apache License 2.0
from apache
@Parameters(name = "fileFormat={0}, catalog={1}")
public static Collection<Object[]> parameters() {
Collection<Object[]> testParams = new ArrayList<>();
// Run tests with every FileFormat for a single Catalog (HiveCatalog)
for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) {
testParams.add(new Object[] { fileFormat, TestTables.TestTableType.HIVE_CATALOG });
}
// Run tests for every Catalog for a single FileFormat (PARQUET) - skip HiveCatalog tests as they are added before
for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) {
if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) {
testParams.add(new Object[] { FileFormat.PARQUET, testTableType });
}
}
return testParams;
}
18
Source : TestStreamScanSql.java
with Apache License 2.0
from apache
public class TestStreamScanSql extends FlinkCatalogTestBase {
private static final String TABLE = "test_table";
private static final FileFormat FORMAT = FileFormat.PARQUET;
private TableEnvironment tEnv;
public TestStreamScanSql(String catalogName, Namespace baseNamespace) {
super(catalogName, baseNamespace);
}
@Override
protected TableEnvironment getTableEnv() {
if (tEnv == null) {
synchronized (this) {
if (tEnv == null) {
EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.enableCheckpointing(400);
StreamTableEnvironment streamTableEnv = StreamTableEnvironment.create(env, settingsBuilder.build());
streamTableEnv.getConfig().getConfiguration().set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true);
tEnv = streamTableEnv;
}
}
}
return tEnv;
}
@Before
public void before() {
super.before();
sql("CREATE DATABASE %s", flinkDatabase);
sql("USE CATALOG %s", catalogName);
sql("USE %s", DATABASE);
}
@After
public void clean() {
sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE);
sql("DROP DATABASE IF EXISTS %s", flinkDatabase);
super.clean();
}
private void insertRows(String partition, Table table, Row... rows) throws IOException {
GenericAppenderHelper appender = new GenericAppenderHelper(table, FORMAT, TEMPORARY_FOLDER);
GenericRecord gRecord = GenericRecord.create(table.schema());
List<Record> records = Lists.newArrayList();
for (Row row : rows) {
records.add(gRecord.copy("id", row.getField(0), "data", row.getField(1), "dt", row.getField(2)));
}
if (partition != null) {
appender.appendToTable(TestHelpers.Row.of(partition, 0), records);
} else {
appender.appendToTable(records);
}
}
private void insertRows(Table table, Row... rows) throws IOException {
insertRows(null, table, rows);
}
private void assertRows(List<Row> expectedRows, Iterator<Row> iterator) {
for (Row expectedRow : expectedRows) {
replacedert.replacedertTrue("Should have more records", iterator.hasNext());
Row actualRow = iterator.next();
replacedert.replacedertEquals("Should have expected fields", 3, actualRow.getArity());
replacedert.replacedertEquals("Should have expected id", expectedRow.getField(0), actualRow.getField(0));
replacedert.replacedertEquals("Should have expected data", expectedRow.getField(1), actualRow.getField(1));
replacedert.replacedertEquals("Should have expected dt", expectedRow.getField(2), actualRow.getField(2));
}
}
@Test
public void testUnPartitionedTable() throws Exception {
sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE);
try (CloseableIterator<Row> iterator = result.collect()) {
Row row1 = Row.of(1, "aaa", "2021-01-01");
insertRows(table, row1);
assertRows(ImmutableList.of(row1), iterator);
Row row2 = Row.of(2, "bbb", "2021-01-01");
insertRows(table, row2);
assertRows(ImmutableList.of(row2), iterator);
}
result.getJobClient().ifPresent(JobClient::cancel);
}
@Test
public void testPartitionedTable() throws Exception {
sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) PARTITIONED BY (dt)", TABLE);
Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE);
try (CloseableIterator<Row> iterator = result.collect()) {
Row row1 = Row.of(1, "aaa", "2021-01-01");
insertRows("2021-01-01", table, row1);
assertRows(ImmutableList.of(row1), iterator);
Row row2 = Row.of(2, "bbb", "2021-01-02");
insertRows("2021-01-02", table, row2);
assertRows(ImmutableList.of(row2), iterator);
Row row3 = Row.of(1, "aaa", "2021-01-02");
insertRows("2021-01-02", table, row3);
assertRows(ImmutableList.of(row3), iterator);
Row row4 = Row.of(2, "bbb", "2021-01-01");
insertRows("2021-01-01", table, row4);
assertRows(ImmutableList.of(row4), iterator);
}
result.getJobClient().ifPresent(JobClient::cancel);
}
@Test
public void testConsumeFromBeginning() throws Exception {
sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
Row row1 = Row.of(1, "aaa", "2021-01-01");
Row row2 = Row.of(2, "bbb", "2021-01-01");
insertRows(table, row1, row2);
TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE);
try (CloseableIterator<Row> iterator = result.collect()) {
assertRows(ImmutableList.of(row1, row2), iterator);
Row row3 = Row.of(3, "ccc", "2021-01-01");
insertRows(table, row3);
assertRows(ImmutableList.of(row3), iterator);
Row row4 = Row.of(4, "ddd", "2021-01-01");
insertRows(table, row4);
assertRows(ImmutableList.of(row4), iterator);
}
result.getJobClient().ifPresent(JobClient::cancel);
}
@Test
public void testConsumeFromStartSnapshotId() throws Exception {
sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
// Produce two snapshots.
Row row1 = Row.of(1, "aaa", "2021-01-01");
Row row2 = Row.of(2, "bbb", "2021-01-01");
insertRows(table, row1);
insertRows(table, row2);
long startSnapshotId = table.currentSnapshot().snapshotId();
Row row3 = Row.of(3, "ccc", "2021-01-01");
Row row4 = Row.of(4, "ddd", "2021-01-01");
insertRows(table, row3, row4);
TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + "'start-snapshot-id'='%d')*/", TABLE, startSnapshotId);
try (CloseableIterator<Row> iterator = result.collect()) {
// The row2 in start snapshot will be excluded.
assertRows(ImmutableList.of(row3, row4), iterator);
Row row5 = Row.of(5, "eee", "2021-01-01");
Row row6 = Row.of(6, "fff", "2021-01-01");
insertRows(table, row5, row6);
assertRows(ImmutableList.of(row5, row6), iterator);
Row row7 = Row.of(7, "ggg", "2021-01-01");
insertRows(table, row7);
assertRows(ImmutableList.of(row7), iterator);
}
result.getJobClient().ifPresent(JobClient::cancel);
}
}
18
Source : TestDeltaTaskWriter.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public class TestDeltaTaskWriter extends TableTestBase {
private static final int FORMAT_V2 = 2;
private final FileFormat format;
@Parameterized.Parameters(name = "FileFormat = {0}")
public static Object[][] parameters() {
return new Object[][] { { "avro" }, { "parquet" } };
}
public TestDeltaTaskWriter(String fileFormat) {
super(FORMAT_V2);
this.format = FileFormat.valueOf(fileFormat.toUpperCase(Locale.ENGLISH));
}
@Before
public void setupTable() throws IOException {
this.tableDir = temp.newFolder();
// created by table create
Assert.assertTrue(tableDir.delete());
this.metadataDir = new File(tableDir, "metadata");
}
private void initTable(boolean partitioned) {
if (partitioned) {
this.table = create(SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("data").build());
} else {
this.table = create(SCHEMA, PartitionSpec.unpartitioned());
}
table.updateProperties().defaultFormat(format).commit();
}
private int idFieldId() {
return table.schema().findField("id").fieldId();
}
private int dataFieldId() {
return table.schema().findField("data").fieldId();
}
private void testCdcEvents(boolean partitioned) throws IOException {
List<Integer> equalityFieldIds = Lists.newArrayList(idFieldId());
TaskWriterFactory<RowData> taskWriterFactory = createTaskWriterFactory(equalityFieldIds);
taskWriterFactory.initialize(1, 1);
// Start the 1st transaction.
TaskWriter<RowData> writer = taskWriterFactory.create();
writer.write(createInsert(1, "aaa"));
writer.write(createInsert(2, "bbb"));
writer.write(createInsert(3, "ccc"));
// Update <2, 'bbb'> to <2, 'ddd'>
// 1 pos-delete and 1 eq-delete.
writer.write(createUpdateBefore(2, "bbb"));
writer.write(createUpdateAfter(2, "ddd"));
// Update <1, 'aaa'> to <1, 'eee'>
// 1 pos-delete and 1 eq-delete.
writer.write(createUpdateBefore(1, "aaa"));
writer.write(createUpdateAfter(1, "eee"));
// Insert <4, 'fff'>
writer.write(createInsert(4, "fff"));
// Insert <5, 'ggg'>
writer.write(createInsert(5, "ggg"));
// Delete <3, 'ccc'>
// 1 pos-delete and 1 eq-delete.
writer.write(createDelete(3, "ccc"));
WriteResult result = writer.complete();
Assert.assertEquals(partitioned ? 7 : 1, result.dataFiles().length);
Assert.assertEquals(partitioned ? 6 : 2, result.deleteFiles().length);
commitTransaction(result);
Assert.assertEquals("Should have expected records.", expectedRowSet(createRecord(1, "eee"), createRecord(2, "ddd"), createRecord(4, "fff"), createRecord(5, "ggg")), actualRowSet("*"));
// Start the 2nd transaction.
writer = taskWriterFactory.create();
// Update <2, 'ddd'> to <6, 'hhh'> - (Update both key and value)
// 1 eq-delete
writer.write(createUpdateBefore(2, "ddd"));
writer.write(createUpdateAfter(6, "hhh"));
// Update <5, 'ggg'> to <5, 'iii'>
// 1 eq-delete
writer.write(createUpdateBefore(5, "ggg"));
writer.write(createUpdateAfter(5, "iii"));
// Delete <4, 'fff'>
// 1 eq-delete.
writer.write(createDelete(4, "fff"));
result = writer.complete();
Assert.assertEquals(partitioned ? 2 : 1, result.dataFiles().length);
Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length);
commitTransaction(result);
Assert.assertEquals("Should have expected records", expectedRowSet(createRecord(1, "eee"), createRecord(5, "iii"), createRecord(6, "hhh")), actualRowSet("*"));
}
@Test
public void testUnpartitioned() throws IOException {
initTable(false);
testCdcEvents(false);
}
@Test
public void testPartitioned() throws IOException {
initTable(true);
testCdcEvents(true);
}
private void testWritePureEqDeletes(boolean partitioned) throws IOException {
initTable(partitioned);
List<Integer> equalityFieldIds = Lists.newArrayList(idFieldId());
TaskWriterFactory<RowData> taskWriterFactory = createTaskWriterFactory(equalityFieldIds);
taskWriterFactory.initialize(1, 1);
TaskWriter<RowData> writer = taskWriterFactory.create();
writer.write(createDelete(1, "aaa"));
writer.write(createDelete(2, "bbb"));
writer.write(createDelete(3, "ccc"));
WriteResult result = writer.complete();
Assert.assertEquals(0, result.dataFiles().length);
Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length);
commitTransaction(result);
Assert.assertEquals("Should have no record", expectedRowSet(), actualRowSet("*"));
}
@Test
public void testUnpartitionedPureEqDeletes() throws IOException {
testWritePureEqDeletes(false);
}
@Test
public void testPartitionedPureEqDeletes() throws IOException {
testWritePureEqDeletes(true);
}
private void testAbort(boolean partitioned) throws IOException {
initTable(partitioned);
List<Integer> equalityFieldIds = Lists.newArrayList(idFieldId());
TaskWriterFactory<RowData> taskWriterFactory = createTaskWriterFactory(equalityFieldIds);
taskWriterFactory.initialize(1, 1);
TaskWriter<RowData> writer = taskWriterFactory.create();
writer.write(createUpdateBefore(1, "aaa"));
writer.write(createUpdateAfter(1, "bbb"));
writer.write(createUpdateBefore(2, "aaa"));
writer.write(createUpdateAfter(2, "bbb"));
// Assert the current data/delete file count.
List<Path> files = Files.walk(Paths.get(tableDir.getPath(), "data")).filter(p -> p.toFile().isFile()).filter(p -> !p.toString().endsWith(".crc")).collect(Collectors.toList());
replacedert.replacedertEquals("Should have expected file count, but files are: " + files, parreplacedioned ? 4 : 2, files.size());
writer.abort();
for (Path file : files) {
Assert.assertFalse(Files.exists(file));
}
}
@Test
public void testUnpartitionedAbort() throws IOException {
testAbort(false);
}
@Test
public void testPartitionedAbort() throws IOException {
testAbort(true);
}
@Test
public void testPartitionedTableWithDataAsKey() throws IOException {
initTable(true);
List<Integer> equalityFieldIds = Lists.newArrayList(dataFieldId());
TaskWriterFactory<RowData> taskWriterFactory = createTaskWriterFactory(equalityFieldIds);
taskWriterFactory.initialize(1, 1);
// Start the 1st transaction.
TaskWriter<RowData> writer = taskWriterFactory.create();
writer.write(createInsert(1, "aaa"));
writer.write(createInsert(2, "aaa"));
writer.write(createInsert(3, "bbb"));
writer.write(createInsert(4, "ccc"));
WriteResult result = writer.complete();
Assert.assertEquals(3, result.dataFiles().length);
Assert.assertEquals(1, result.deleteFiles().length);
commitTransaction(result);
Assert.assertEquals("Should have expected records", expectedRowSet(createRecord(2, "aaa"), createRecord(3, "bbb"), createRecord(4, "ccc")), actualRowSet("*"));
// Start the 2nd transaction.
writer = taskWriterFactory.create();
writer.write(createInsert(5, "aaa"));
writer.write(createInsert(6, "bbb"));
// 1 eq-delete.
writer.write(createDelete(7, "ccc"));
result = writer.complete();
Assert.assertEquals(2, result.dataFiles().length);
Assert.assertEquals(1, result.deleteFiles().length);
commitTransaction(result);
Assert.assertEquals("Should have expected records", expectedRowSet(createRecord(2, "aaa"), createRecord(5, "aaa"), createRecord(3, "bbb"), createRecord(6, "bbb")), actualRowSet("*"));
}
@Test
public void testPartitionedTableWithDataAndIdAsKey() throws IOException {
initTable(true);
List<Integer> equalityFieldIds = Lists.newArrayList(dataFieldId(), idFieldId());
TaskWriterFactory<RowData> taskWriterFactory = createTaskWriterFactory(equalityFieldIds);
taskWriterFactory.initialize(1, 1);
TaskWriter<RowData> writer = taskWriterFactory.create();
writer.write(createInsert(1, "aaa"));
writer.write(createInsert(2, "aaa"));
// 1 pos-delete and 1 eq-delete.
writer.write(createDelete(2, "aaa"));
WriteResult result = writer.complete();
Assert.assertEquals(1, result.dataFiles().length);
Assert.assertEquals(2, result.deleteFiles().length);
Assert.assertEquals(Sets.newHashSet(FileContent.EQUALITY_DELETES, FileContent.POSITION_DELETES), Sets.newHashSet(result.deleteFiles()[0].content(), result.deleteFiles()[1].content()));
commitTransaction(result);
Assert.assertEquals("Should have expected records", expectedRowSet(createRecord(1, "aaa")), actualRowSet("*"));
}
private void commitTransaction(WriteResult result) {
RowDelta rowDelta = table.newRowDelta();
Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
rowDelta.validateDeletedFiles().validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())).commit();
}
private StructLikeSet expectedRowSet(Record... records) {
return SimpleDataUtil.expectedRowSet(table, records);
}
private StructLikeSet actualRowSet(String... columns) throws IOException {
return SimpleDataUtil.actualRowSet(table, columns);
}
private TaskWriterFactory<RowData> createTaskWriterFactory(List<Integer> equalityFieldIds) {
return new RowDataTaskWriterFactory(table.schema(), FlinkSchemaUtil.convert(table.schema()), table.spec(), table.locationProvider(), table.io(), table.encryption(), 128 * 1024 * 1024, format, table.properties(), equalityFieldIds);
}
}
18
Source : FlinkAppenderFactory.java
with Apache License 2.0
from apache
@Override
public DataWriter<RowData> newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) {
return new DataWriter<>(newAppender(file.encryptingOutputFile(), format), format, file.encryptingOutputFile().location(), spec, partition, file.keyMetadata());
}
18
Source : GenericAppenderHelper.java
with Apache License 2.0
from apache
/**
* Helper for appending {@link DataFile} to a table or appending {@link Record}s to a table.
*/
public class GenericAppenderHelper {
private final Table table;
private final FileFormat fileFormat;
private final TemporaryFolder tmp;
public GenericAppenderHelper(Table table, FileFormat fileFormat, TemporaryFolder tmp) {
this.table = table;
this.fileFormat = fileFormat;
this.tmp = tmp;
}
public void appendToTable(DataFile... dataFiles) {
Preconditions.checkNotNull(table, "table not set");
AppendFiles append = table.newAppend();
for (DataFile dataFile : dataFiles) {
append = append.appendFile(dataFile);
}
append.commit();
}
public void appendToTable(List<Record> records) throws IOException {
appendToTable(null, records);
}
public void appendToTable(StructLike partition, List<Record> records) throws IOException {
appendToTable(writeFile(partition, records));
}
public DataFile writeFile(StructLike partition, List<Record> records) throws IOException {
Preconditions.checkNotNull(table, "table not set");
File file = tmp.newFile();
Assert.assertTrue(file.delete());
return appendToLocalFile(table, file, fileFormat, partition, records);
}
private static DataFile appendToLocalFile(Table table, File file, FileFormat format, StructLike partition, List<Record> records) throws IOException {
FileAppender<Record> appender = new GenericAppenderFactory(table.schema()).newAppender(Files.localOutput(file), format);
try (FileAppender<Record> fileAppender = appender) {
fileAppender.addAll(records);
}
return DataFiles.builder(table.spec()).withRecordCount(records.size()).withFileSizeInBytes(file.length()).withPath(Files.localInput(file).location()).withMetrics(appender.metrics()).withFormat(format).withPartition(partition).build();
}
}
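As a usage note (not part of the original source), the helper above is typically driven from a test roughly as follows; `table`, `SCHEMA` and `temp` are assumed to come from the surrounding test base:
// Hedged sketch: assumes `table`, `SCHEMA` and a JUnit `temp` TemporaryFolder exist in the test.
List<Record> records = RandomGenericData.generate(SCHEMA, 100, 0L);
GenericAppenderHelper appender = new GenericAppenderHelper(table, FileFormat.PARQUET, temp);
appender.appendToTable(records); // writes one Parquet data file and commits an append to the table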
18
Source : GenericAppenderFactory.java
with Apache License 2.0
from apache
@Override
public org.apache.iceberg.io.DataWriter<Record> newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) {
return new org.apache.iceberg.io.DataWriter<>(newAppender(file.encryptingOutputFile(), format), format, file.encryptingOutputFile().location(), spec, partition, file.keyMetadata());
}
17
Source : IcebergSplit.java
with Apache License 2.0
from trinodb
public class IcebergSplit implements ConnectorSplit {
private final String path;
private final long start;
private final long length;
private final long fileSize;
private final FileFormat fileFormat;
private final List<HostAddress> addresses;
private final Map<Integer, String> partitionKeys;
@JsonCreator
public IcebergSplit(@JsonProperty("path") String path, @JsonProperty("start") long start, @JsonProperty("length") long length, @JsonProperty("fileSize") long fileSize, @JsonProperty("fileFormat") FileFormat fileFormat, @JsonProperty("addresses") List<HostAddress> addresses, @JsonProperty("partitionKeys") Map<Integer, String> partitionKeys) {
this.path = requireNonNull(path, "path is null");
this.start = start;
this.length = length;
this.fileSize = fileSize;
this.fileFormat = requireNonNull(fileFormat, "fileFormat is null");
this.addresses = ImmutableList.copyOf(requireNonNull(addresses, "addresses is null"));
this.partitionKeys = Collections.unmodifiableMap(requireNonNull(partitionKeys, "partitionKeys is null"));
}
@Override
public boolean isRemotelyAccessible() {
return true;
}
@JsonProperty
@Override
public List<HostAddress> getAddresses() {
return addresses;
}
@JsonProperty
public String getPath() {
return path;
}
@JsonProperty
public long getStart() {
return start;
}
@JsonProperty
public long getLength() {
return length;
}
@JsonProperty
public long getFileSize() {
return fileSize;
}
@JsonProperty
public FileFormat getFileFormat() {
return fileFormat;
}
@JsonProperty
public Map<Integer, String> getPartitionKeys() {
return partitionKeys;
}
@Override
public Object getInfo() {
return ImmutableMap.builder().put("path", path).put("start", start).put("length", length).build();
}
@Override
public String toString() {
return toStringHelper(this).addValue(path).addValue(start).addValue(length).toString();
}
}
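For illustration only, a split for a single data file can be built through the JSON-creator constructor shown above; the path and sizes below are made-up values, and in Trino such splits normally come from the split source rather than being hand-built:
// Hypothetical example values; only the constructor shown above is assumed.
IcebergSplit split = new IcebergSplit(
    "s3://bucket/warehouse/db/tbl/data/file-0.parquet", // path (made up)
    0L,                 // start offset
    1024L,              // length to read
    1024L,              // total file size
    FileFormat.PARQUET, // org.apache.iceberg.FileFormat
    ImmutableList.of(), // no preferred host addresses
    ImmutableMap.of()); // no partition keys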
17
Source : IcebergFileWriterFactory.java
with Apache License 2.0
from trinodb
public IcebergFileWriter createFileWriter(Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session, HdfsContext hdfsContext, FileFormat fileFormat) {
switch(fileFormat) {
case PARQUET:
return createParquetWriter(outputPath, icebergSchema, jobConf, session, hdfsContext);
case ORC:
return createOrcWriter(outputPath, icebergSchema, jobConf, session);
default:
throw new TrinoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + fileFormat);
}
}
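A note on where the FileFormat argument usually comes from: callers commonly parse it out of the table's write.format.default property, for example via FileFormat.valueOf, as several tests later in this listing do. A minimal sketch under that assumption:
// Hedged sketch: resolve an org.apache.iceberg.FileFormat from a table property string.
String formatName = table.properties().getOrDefault(TableProperties.DEFAULT_FILE_FORMAT, "parquet");
FileFormat fileFormat = FileFormat.valueOf(formatName.toUpperCase(Locale.ENGLISH));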
17
Source : SparkAppenderFactory.java
with Apache License 2.0
from apache
@Override
public EqualityDeleteWriter<InternalRow> newEqDeleteWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) {
Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field ids shouldn't be null or empty when creating equality-delete writer");
Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer");
try {
switch(format) {
case PARQUET:
return Parquet.writeDeletes(file.encryptingOutputFile()).createWriterFunc(msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)).overwrite().rowSchema(eqDeleteRowSchema).withSpec(spec).withPartition(partition).equalityFieldIds(equalityFieldIds).withKeyMetadata(file.keyMetadata()).buildEqualityWriter();
case AVRO:
return Avro.writeDeletes(file.encryptingOutputFile()).createWriterFunc(ignored -> new SparkAvroWriter(lazyEqDeleteSparkType())).overwrite().rowSchema(eqDeleteRowSchema).withSpec(spec).withPartition(partition).equalityFieldIds(equalityFieldIds).withKeyMetadata(file.keyMetadata()).buildEqualityWriter();
default:
throw new UnsupportedOperationException("Cannot write equality-deletes for unsupported file format: " + format);
}
} catch (IOException e) {
throw new UncheckedIOException("Failed to create new equality delete writer", e);
}
}
17
Source : TestTables.java
with Apache License 2.0
from apache
/**
* Creates a Hive test table. Creates the Iceberg table/data and creates the corresponding Hive table as well when
* needed. The table will be in the 'default' database. The table will be populated with randomly
* generated {@link Record}s.
* @param shell The HiveShell used for Hive table creation
* @param tableName The name of the test table
* @param schema The schema used for the table creation
* @param fileFormat The file format used for writing the data
* @param numRecords The number of records to generate and store in the table
* @throws IOException If there is an error writing data
*/
public List<Record> createTableWithGeneratedRecords(TestHiveShell shell, String tableName, Schema schema, FileFormat fileFormat, int numRecords) throws IOException {
List<Record> records = TestHelper.generateRandomRecords(schema, numRecords, 0L);
createTable(shell, tableName, schema, fileFormat, records);
return records;
}
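A hedged usage sketch of the method above; `shell` and `testTables` are assumed to come from the test setup shown elsewhere in this listing, and the schema constant is the one defined in HiveIcebergStorageHandlerTestUtils further down:
// Creates an ORC-backed Iceberg/Hive test table with 10 random rows (illustrative call only).
List<Record> generated = testTables.createTableWithGeneratedRecords(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, FileFormat.ORC, 10);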
17
Source : TestTables.java
with Apache License 2.0
from apache
/**
* Creates a non-partitioned Hive test table. Creates the Iceberg table/data and creates the corresponding Hive
* table as well when needed. The table will be in the 'default' database. The table will be populated with the
* provided List of {@link Record}s.
* @param shell The HiveShell used for Hive table creation
* @param tableName The name of the test table
* @param schema The schema used for the table creation
* @param fileFormat The file format used for writing the data
* @param records The records with which the table is populated
* @return The created table
* @throws IOException If there is an error writing data
*/
public Table createTable(TestHiveShell shell, String tableName, Schema schema, FileFormat fileFormat, List<Record> records) throws IOException {
Table table = createIcebergTable(shell.getHiveConf(), tableName, schema, fileFormat, records);
String createHiveSQL = createHiveTableSQL(TableIdentifier.of("default", tableName), ImmutableMap.of());
if (createHiveSQL != null) {
shell.executeStatement(createHiveSQL);
}
return table;
}
17
Source : TestTables.java
with Apache License 2.0
from apache
/**
* Creates a partitioned Hive test table using Hive SQL. The table will be in the 'default' database.
* The table will be populated with the provided List of {@link Record}s using a Hive insert statement.
* @param shell The HiveShell used for Hive table creation
* @param tableName The name of the test table
* @param schema The schema used for the table creation
* @param spec The partition specification for the table
* @param fileFormat The file format used for writing the data
* @param records The records with which the table is populated
* @return The created table
* @throws IOException If there is an error writing data
*/
public Table createTable(TestHiveShell shell, String tableName, Schema schema, PartitionSpec spec, FileFormat fileFormat, List<Record> records) {
TableIdentifier identifier = TableIdentifier.of("default", tableName);
shell.executeStatement("CREATE EXTERNAL TABLE " + identifier + " STORED BY '" + HiveIcebergStorageHandler.class.getName() + "' " + locationForCreateTableSQL(identifier) + "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + SchemaParser.toJson(schema) + "', " + "'" + InputFormatConfig.PARTITION_SPEC + "'='" + PartitionSpecParser.toJson(spec) + "', " + "'" + TableProperties.DEFAULT_FILE_FORMAT + "'='" + fileFormat + "')");
if (records != null && !records.isEmpty()) {
StringBuilder query = new StringBuilder().append("INSERT INTO " + identifier + " VALUES ");
records.forEach(record -> {
query.append("(");
query.append(record.struct().fields().stream().map(field -> getStringValueForInsert(record.getField(field.name()), field.type())).collect(Collectors.joining(",")));
query.append("),");
});
query.setLength(query.length() - 1);
shell.executeStatement(query.toString());
}
return loadTable(identifier);
}
17
Source : HiveIcebergStorageHandlerTestUtils.java
with Apache License 2.0
from apache
public class HiveIcebergStorageHandlerTestUtils {
static final FileFormat[] FILE_FORMATS = new FileFormat[] { FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET };
static final Schema CUSTOMER_SCHEMA = new Schema(optional(1, "customer_id", Types.LongType.get()), optional(2, "first_name", Types.StringType.get(), "This is first name"), optional(3, "last_name", Types.StringType.get(), "This is last name"));
static final Schema CUSTOMER_SCHEMA_WITH_UPPERCASE = new Schema(optional(1, "CustomER_Id", Types.LongType.get()), optional(2, "First_name", Types.StringType.get()), optional(3, "Last_name", Types.StringType.get()));
static final List<Record> CUSTOMER_RECORDS = TestHelper.RecordsBuilder.newInstance(CUSTOMER_SCHEMA).add(0L, "Alice", "Brown").add(1L, "Bob", "Green").add(2L, "Trudy", "Pink").build();
private HiveIcebergStorageHandlerTestUtils() {
// Empty constructor for the utility class
}
static TestHiveShell shell() {
TestHiveShell shell = new TestHiveShell();
shell.setHiveConfValue("hive.notification.event.poll.interval", "-1");
shell.setHiveConfValue("hive.tez.exec.print.summary", "true");
// We would like to make sure that ORC reading overrides this config, so reading Iceberg tables could work in
// systems (like Hive 3.2 and higher) where this value is set to true explicitly.
shell.setHiveConfValue(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName(), "true");
shell.start();
return shell;
}
static TestTables testTables(TestHiveShell shell, TestTables.TestTableType testTableType, TemporaryFolder temp) throws IOException {
return testTableType.instance(shell.metastore().hiveConf(), temp);
}
static void init(TestHiveShell shell, TestTables testTables, TemporaryFolder temp, String engine) {
shell.openSession();
for (Map.Entry<String, String> property : testTables.properties().entrySet()) {
shell.setHiveSessionValue(property.getKey(), property.getValue());
}
shell.setHiveSessionValue("hive.execution.engine", engine);
shell.setHiveSessionValue("hive.jar.directory", temp.getRoot().getAbsolutePath());
shell.setHiveSessionValue("tez.staging-dir", temp.getRoot().getAbsolutePath());
// temporarily disabling vectorization in Tez, since it doesn't work with projection pruning (fix: TEZ-4248)
// TODO: remove this once TEZ-4248 has been released and the Tez dependencies updated here
if (engine.equals("tez")) {
shell.setHiveSessionValue("hive.vectorized.execution.enabled", "false");
}
}
static void close(TestHiveShell shell) throws Exception {
shell.closeSession();
shell.metastore().reset();
// HiveServer2 thread pools are using thread local Hive -> HMSClient objects. These are not cleaned up when the
// HiveServer2 is stopped. Only Finalizer closes the HMS connections.
System.gc();
}
}
17
Source : TestFlinkTableSink.java
with Apache License 2.0
from apache
@Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}")
public static Iterable<Object[]> parameters() {
List<Object[]> parameters = Lists.newArrayList();
for (FileFormat format : new FileFormat[] { FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET }) {
for (Boolean isStreaming : new Boolean[] { true, false }) {
for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) {
String catalogName = (String) catalogParams[0];
Namespace baseNamespace = (Namespace) catalogParams[1];
parameters.add(new Object[] { catalogName, baseNamespace, format, isStreaming });
}
}
}
return parameters;
}
17
Source : TestStreamingReaderOperator.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public class TestStreamingReaderOperator extends TableTestBase {
private static final Schema SCHEMA = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get()), Types.NestedField.required(2, "data", Types.StringType.get()));
private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET;
@Parameterized.Parameters(name = "FormatVersion={0}")
public static Iterable<Object[]> parameters() {
return ImmutableList.of(new Object[] { 1 }, new Object[] { 2 });
}
public TestStreamingReaderOperator(int formatVersion) {
super(formatVersion);
}
@Before
@Override
public void setupTable() throws IOException {
this.tableDir = temp.newFolder();
this.metadataDir = new File(tableDir, "metadata");
Assert.assertTrue(tableDir.delete());
// Construct the iceberg table.
table = create(SCHEMA, PartitionSpec.unpartitioned());
}
@Test
public void testProcessAllRecords() throws Exception {
List<List<Record>> expectedRecords = generateRecordsAndCommitTxn(10);
List<FlinkInputSplit> splits = generateSplits();
Assert.assertEquals("Should have 10 splits", 10, splits.size());
try (OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness = createReader()) {
harness.setup();
harness.open();
SteppingMailboxProcessor processor = createLocalMailbox(harness);
List<Record> expected = Lists.newArrayList();
for (int i = 0; i < splits.size(); i++) {
// Process this element to enqueue to mail-box.
harness.processElement(splits.get(i), -1);
// Run the mail-box once to read all records from the given split.
Assert.assertTrue("Should processed 1 split", processor.runMailboxStep());
// Assert the output has expected elements.
expected.addAll(expectedRecords.get(i));
TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA);
}
}
}
@Test
public void testTriggerCheckpoint() throws Exception {
// Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading records from
// split1.
List<List<Record>> expectedRecords = generateRecordsAndCommitTxn(3);
List<FlinkInputSplit> splits = generateSplits();
Assert.assertEquals("Should have 3 splits", 3, splits.size());
long timestamp = 0;
try (OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness = createReader()) {
harness.setup();
harness.open();
SteppingMailboxProcessor processor = createLocalMailbox(harness);
harness.processElement(splits.get(0), ++timestamp);
harness.processElement(splits.get(1), ++timestamp);
harness.processElement(splits.get(2), ++timestamp);
// Trigger snapshot state, it will start to work once all records from split0 are read.
processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot");
Assert.assertTrue("Should have processed the split0", processor.runMailboxStep());
Assert.assertTrue("Should have processed the snapshot state action", processor.runMailboxStep());
TestHelpers.assertRecords(readOutputValues(harness), expectedRecords.get(0), SCHEMA);
// Read records from split1.
Assert.assertTrue("Should have processed the split1", processor.runMailboxStep());
// Read records from split2.
Assert.assertTrue("Should have processed the split2", processor.runMailboxStep());
TestHelpers.assertRecords(readOutputValues(harness), Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA);
}
}
@Test
public void testCheckpointRestore() throws Exception {
List<List<Record>> expectedRecords = generateRecordsAndCommitTxn(15);
List<FlinkInputSplit> splits = generateSplits();
Assert.assertEquals("Should have 15 splits", 15, splits.size());
OperatorSubtaskState state;
List<Record> expected = Lists.newArrayList();
try (OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness = createReader()) {
harness.setup();
harness.open();
// Enqueue all the splits.
for (FlinkInputSplit split : splits) {
harness.processElement(split, -1);
}
// Read all records from the first five splits.
SteppingMailboxProcessor localMailbox = createLocalMailbox(harness);
for (int i = 0; i < 5; i++) {
expected.addAll(expectedRecords.get(i));
Assert.assertTrue("Should have processed the split#" + i, localMailbox.runMailboxStep());
TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA);
}
// Snapshot state now, there're 10 splits left in the state.
state = harness.snapshot(1, 1);
}
expected.clear();
try (OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness = createReader()) {
harness.setup();
// Recover to process the remaining splits.
harness.initializeState(state);
harness.open();
SteppingMailboxProcessor localMailbox = createLocalMailbox(harness);
for (int i = 5; i < 10; i++) {
expected.addAll(expectedRecords.get(i));
Assert.assertTrue("Should have processed one split#" + i, localMailbox.runMailboxStep());
TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA);
}
// Let's process the final 5 splits now.
for (int i = 10; i < 15; i++) {
expected.addAll(expectedRecords.get(i));
harness.processElement(splits.get(i), 1);
Assert.assertTrue("Should have processed the split#" + i, localMailbox.runMailboxStep());
TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA);
}
}
}
private List<Row> readOutputValues(OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness) {
List<Row> results = Lists.newArrayList();
for (RowData rowData : harness.extractOutputValues()) {
results.add(Row.of(rowData.getInt(0), rowData.getString(1).toString()));
}
return results;
}
private List<List<Record>> generateRecordsAndCommitTxn(int commitTimes) throws IOException {
List<List<Record>> expectedRecords = Lists.newArrayList();
for (int i = 0; i < commitTimes; i++) {
List<Record> records = RandomGenericData.generate(SCHEMA, 100, 0L);
expectedRecords.add(records);
// Commit those records to iceberg table.
writeRecords(records);
}
return expectedRecords;
}
private void writeRecords(List<Record> records) throws IOException {
GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp);
appender.appendToTable(records);
}
private List<FlinkInputSplit> generateSplits() {
List<FlinkInputSplit> inputSplits = Lists.newArrayList();
List<Long> snapshotIds = SnapshotUtil.currentAncestors(table);
for (int i = snapshotIds.size() - 1; i >= 0; i--) {
ScanContext scanContext;
if (i == snapshotIds.size() - 1) {
// Generate the splits from the first snapshot.
scanContext = ScanContext.builder().useSnapshotId(snapshotIds.get(i)).build();
} else {
// Generate the splits between the previous snapshot and current snapshot.
scanContext = ScanContext.builder().startSnapshotId(snapshotIds.get(i + 1)).endSnapshotId(snapshotIds.get(i)).build();
}
Collections.addAll(inputSplits, FlinkSplitGenerator.createInputSplits(table, scanContext));
}
return inputSplits;
}
private OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> createReader() throws Exception {
// This input format is used to open the emitted splits.
FlinkInputFormat inputFormat = FlinkSource.forRowData().tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())).buildFormat();
OneInputStreamOperatorFactory<FlinkInputSplit, RowData> factory = StreamingReaderOperator.factory(inputFormat);
OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness = new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0);
harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime);
return harness;
}
private SteppingMailboxProcessor createLocalMailbox(OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness) {
return new SteppingMailboxProcessor(MailboxDefaultAction.Controller::suspendDefaultAction, harness.getTaskMailbox(), StreamTaskActionExecutor.IMMEDIATE);
}
}
17
Source : TestStreamingMonitorFunction.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public class TestStreamingMonitorFunction extends TableTestBase {
private static final Schema SCHEMA = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get()), Types.NestedField.required(2, "data", Types.StringType.get()));
private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET;
private static final long WAIT_TIME_MILLIS = 10 * 1000L;
@Parameterized.Parameters(name = "FormatVersion={0}")
public static Iterable<Object[]> parameters() {
return ImmutableList.of(new Object[] { 1 }, new Object[] { 2 });
}
public TestStreamingMonitorFunction(int formatVersion) {
super(formatVersion);
}
@Before
@Override
public void setupTable() throws IOException {
this.tableDir = temp.newFolder();
this.metadataDir = new File(tableDir, "metadata");
Assert.assertTrue(tableDir.delete());
// Construct the iceberg table.
table = create(SCHEMA, PartitionSpec.unpartitioned());
}
private void runSourceFunctionInTask(TestSourceContext sourceContext, StreamingMonitorFunction function) {
Thread task = new Thread(() -> {
try {
function.run(sourceContext);
} catch (Exception e) {
throw new RuntimeException(e);
}
});
task.start();
}
@Test
public void testConsumeWithoutStartSnapshotId() throws Exception {
List<List<Record>> recordsList = generateRecordsAndCommitTxn(10);
ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build();
StreamingMonitorFunction function = createFunction(scanContext);
try (AbstractStreamOperatorTestHarness<FlinkInputSplit> harness = createHarness(function)) {
harness.setup();
harness.open();
CountDownLatch latch = new CountDownLatch(1);
TestSourceContext sourceContext = new TestSourceContext(latch);
runSourceFunctionInTask(sourceContext, function);
Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS));
Thread.sleep(1000L);
// Stop the stream task.
function.close();
Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size());
TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA);
}
}
@Test
public void testConsumeFromStartSnapshotId() throws Exception {
// Commit the first five transactions.
generateRecordsAndCommitTxn(5);
long startSnapshotId = table.currentSnapshot().snapshotId();
// Commit the next five transactions.
List<List<Record>> recordsList = generateRecordsAndCommitTxn(5);
ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).startSnapshotId(startSnapshotId).build();
StreamingMonitorFunction function = createFunction(scanContext);
try (AbstractStreamOperatorTestHarness<FlinkInputSplit> harness = createHarness(function)) {
harness.setup();
harness.open();
CountDownLatch latch = new CountDownLatch(1);
TestSourceContext sourceContext = new TestSourceContext(latch);
runSourceFunctionInTask(sourceContext, function);
Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS));
Thread.sleep(1000L);
// Stop the stream task.
function.close();
Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size());
TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA);
}
}
@Test
public void testCheckpointRestore() throws Exception {
List<List<Record>> recordsList = generateRecordsAndCommitTxn(10);
ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build();
StreamingMonitorFunction func = createFunction(scanContext);
OperatorSubtaskState state;
try (AbstractStreamOperatorTestHarness<FlinkInputSplit> harness = createHarness(func)) {
harness.setup();
harness.open();
CountDownLatch latch = new CountDownLatch(1);
TestSourceContext sourceContext = new TestSourceContext(latch);
runSourceFunctionInTask(sourceContext, func);
Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS));
Thread.sleep(1000L);
state = harness.snapshot(1, 1);
// Stop the stream task.
func.close();
Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size());
TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA);
}
List<List<Record>> newRecordsList = generateRecordsAndCommitTxn(10);
StreamingMonitorFunction newFunc = createFunction(scanContext);
try (AbstractStreamOperatorTestHarness<FlinkInputSplit> harness = createHarness(newFunc)) {
harness.setup();
// Recover to process the remaining snapshots.
harness.initializeState(state);
harness.open();
CountDownLatch latch = new CountDownLatch(1);
TestSourceContext sourceContext = new TestSourceContext(latch);
runSourceFunctionInTask(sourceContext, newFunc);
Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS));
Thread.sleep(1000L);
// Stop the stream task.
newFunc.close();
Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size());
TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA);
}
}
private List<List<Record>> generateRecordsAndCommitTxn(int commitTimes) throws IOException {
List<List<Record>> expectedRecords = Lists.newArrayList();
for (int i = 0; i < commitTimes; i++) {
List<Record> records = RandomGenericData.generate(SCHEMA, 100, 0L);
expectedRecords.add(records);
// Commit those records to iceberg table.
writeRecords(records);
}
return expectedRecords;
}
private void writeRecords(List<Record> records) throws IOException {
GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp);
appender.appendToTable(records);
}
private StreamingMonitorFunction createFunction(ScanContext scanContext) {
return new StreamingMonitorFunction(TestTableLoader.of(tableDir.getAbsolutePath()), scanContext);
}
private AbstractStreamOperatorTestHarness<FlinkInputSplit> createHarness(StreamingMonitorFunction function) throws Exception {
StreamSource<FlinkInputSplit, StreamingMonitorFunction> streamSource = new StreamSource<>(function);
return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0);
}
private class TestSourceContext implements SourceFunction.SourceContext<FlinkInputSplit> {
private final List<FlinkInputSplit> splits = Lists.newArrayList();
private final Object checkpointLock = new Object();
private final CountDownLatch latch;
TestSourceContext(CountDownLatch latch) {
this.latch = latch;
}
@Override
public void collect(FlinkInputSplit element) {
splits.add(element);
latch.countDown();
}
@Override
public void collectWithTimestamp(FlinkInputSplit element, long timestamp) {
collect(element);
}
@Override
public void emitWatermark(Watermark mark) {
}
@Override
public void markAsTemporarilyIdle() {
}
@Override
public Object getCheckpointLock() {
return checkpointLock;
}
@Override
public void close() {
}
private List<Row> toRows() throws IOException {
FlinkInputFormat format = FlinkSource.forRowData().tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())).buildFormat();
List<Row> rows = Lists.newArrayList();
for (FlinkInputSplit split : splits) {
format.open(split);
RowData element = null;
try {
while (!format.reachedEnd()) {
element = format.nextRecord(element);
rows.add(Row.of(element.getInt(0), element.getString(1).toString()));
}
} finally {
format.close();
}
}
return rows;
}
}
}
17
Source : TestIcebergFilesCommitter.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public class TestIcebergFilesCommitter extends TableTestBase {
private static final Configuration CONF = new Configuration();
private String tablePath;
private File flinkManifestFolder;
private final FileFormat format;
@Parameterized.Parameters(name = "FileFormat = {0}, FormatVersion={1}")
public static Object[][] parameters() {
return new Object[][] { new Object[] { "avro", 1 }, new Object[] { "avro", 2 }, new Object[] { "parquet", 1 }, new Object[] { "parquet", 2 }, new Object[] { "orc", 1 } };
}
public TestIcebergFilesCommitter(String format, int formatVersion) {
super(formatVersion);
this.format = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));
}
@Before
public void setupTable() throws IOException {
flinkManifestFolder = temp.newFolder();
this.tableDir = temp.newFolder();
this.metadataDir = new File(tableDir, "metadata");
Assert.assertTrue(tableDir.delete());
tablePath = tableDir.getAbsolutePath();
// Construct the iceberg table.
table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned());
table.updateProperties().set(DEFAULT_FILE_FORMAT, format.name()).set(FLINK_MANIFEST_LOCATION, flinkManifestFolder.getAbsolutePath()).commit();
}
@Test
public void testCommitTxnWithoutDataFiles() throws Exception {
long checkpointId = 0;
long timestamp = 0;
JobID jobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
SimpleDataUtil.assertTableRows(table, Lists.newArrayList());
assertSnapshotSize(0);
assertMaxCommittedCheckpointId(jobId, -1L);
// It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the future flink job
// failover won't fail.
for (int i = 1; i <= 3; i++) {
harness.snapshot(++checkpointId, ++timestamp);
assertFlinkManifests(0);
harness.notifyOfCompletedCheckpoint(checkpointId);
assertFlinkManifests(0);
assertSnapshotSize(i);
assertMaxCommittedCheckpointId(jobId, checkpointId);
}
}
}
private WriteResult of(DataFile dataFile) {
return WriteResult.builder().addDataFiles(dataFile).build();
}
@Test
public void testCommitTxn() throws Exception {
// Test with 3 consecutive checkpoints:
// 1. snapshotState for checkpoint#1
// 2. notifyCheckpointComplete for checkpoint#1
// 3. snapshotState for checkpoint#2
// 4. notifyCheckpointComplete for checkpoint#2
// 5. snapshotState for checkpoint#3
// 6. notifyCheckpointComplete for checkpoint#3
long timestamp = 0;
JobID jobID = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobID)) {
harness.setup();
harness.open();
assertSnapshotSize(0);
List<RowData> rows = Lists.newArrayListWithExpectedSize(3);
for (int i = 1; i <= 3; i++) {
RowData rowData = SimpleDataUtil.createRowData(i, "hello" + i);
DataFile dataFile = writeDataFile("data-" + i, ImmutableList.of(rowData));
harness.processElement(of(dataFile), ++timestamp);
rows.add(rowData);
harness.snapshot(i, ++timestamp);
assertFlinkManifests(1);
harness.notifyOfCompletedCheckpoint(i);
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows));
assertSnapshotSize(i);
assertMaxCommittedCheckpointId(jobID, i);
}
}
}
@Test
public void testOrderedEventsBetweenCheckpoints() throws Exception {
// It's possible that two checkpoints happen in the following orders:
// 1. snapshotState for checkpoint#1;
// 2. snapshotState for checkpoint#2;
// 3. notifyCheckpointComplete for checkpoint#1;
// 4. notifyCheckpointComplete for checkpoint#2;
long timestamp = 0;
JobID jobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
assertMaxCommittedCheckpointId(jobId, -1L);
RowData row1 = SimpleDataUtil.createRowData(1, "hello");
DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1));
harness.processElement(of(dataFile1), ++timestamp);
assertMaxCommittedCheckpointId(jobId, -1L);
// 1. snapshotState for checkpoint#1
long firstCheckpointId = 1;
harness.snapshot(firstCheckpointId, ++timestamp);
assertFlinkManifests(1);
RowData row2 = SimpleDataUtil.createRowData(2, "world");
DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2));
harness.processElement(of(dataFile2), ++timestamp);
assertMaxCommittedCheckpointId(jobId, -1L);
// 2. snapshotState for checkpoint#2
long secondCheckpointId = 2;
harness.snapshot(secondCheckpointId, ++timestamp);
assertFlinkManifests(2);
// 3. notifyCheckpointComplete for checkpoint#1
harness.notifyOfCompletedCheckpoint(firstCheckpointId);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1));
assertMaxCommittedCheckpointId(jobId, firstCheckpointId);
assertFlinkManifests(1);
// 4. notifyCheckpointComplete for checkpoint#2
harness.notifyOfCompletedCheckpoint(secondCheckpointId);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2));
assertMaxCommittedCheckpointId(jobId, secondCheckpointId);
assertFlinkManifests(0);
}
}
@Test
public void testDisorderedEventsBetweenCheckpoints() throws Exception {
// It's possible that the two checkpoints happen in the following orders:
// 1. snapshotState for checkpoint#1;
// 2. snapshotState for checkpoint#2;
// 3. notifyCheckpointComplete for checkpoint#2;
// 4. notifyCheckpointComplete for checkpoint#1;
long timestamp = 0;
JobID jobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
assertMaxCommittedCheckpointId(jobId, -1L);
RowData row1 = SimpleDataUtil.createRowData(1, "hello");
DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1));
harness.processElement(of(dataFile1), ++timestamp);
assertMaxCommittedCheckpointId(jobId, -1L);
// 1. snapshotState for checkpoint#1
long firstCheckpointId = 1;
harness.snapshot(firstCheckpointId, ++timestamp);
assertFlinkManifests(1);
RowData row2 = SimpleDataUtil.createRowData(2, "world");
DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2));
harness.processElement(of(dataFile2), ++timestamp);
assertMaxCommittedCheckpointId(jobId, -1L);
// 2. snapshotState for checkpoint#2
long secondCheckpointId = 2;
harness.snapshot(secondCheckpointId, ++timestamp);
assertFlinkManifests(2);
// 3. notifyCheckpointComplete for checkpoint#2
harness.notifyOfCompletedCheckpoint(secondCheckpointId);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2));
assertMaxCommittedCheckpointId(jobId, secondCheckpointId);
assertFlinkManifests(0);
// 4. notifyCheckpointComplete for checkpoint#1
harness.notifyOfCompletedCheckpoint(firstCheckpointId);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2));
assertMaxCommittedCheckpointId(jobId, secondCheckpointId);
assertFlinkManifests(0);
}
}
@Test
public void testRecoveryFromValidSnapshot() throws Exception {
long checkpointId = 0;
long timestamp = 0;
List<RowData> expectedRows = Lists.newArrayList();
OperatorSubtaskState snapshot;
JobID jobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
assertSnapshotSize(0);
assertMaxCommittedCheckpointId(jobId, -1L);
RowData row = SimpleDataUtil.createRowData(1, "hello");
expectedRows.add(row);
DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row));
harness.processElement(of(dataFile1), ++timestamp);
snapshot = harness.snapshot(++checkpointId, ++timestamp);
assertFlinkManifests(1);
harness.notifyOfCompletedCheckpoint(checkpointId);
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(row));
assertSnapshotSize(1);
assertMaxCommittedCheckpointId(jobId, checkpointId);
}
// Restore from the given snapshot
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.initializeState(snapshot);
harness.open();
SimpleDataUtil.assertTableRows(table, expectedRows);
assertSnapshotSize(1);
assertMaxCommittedCheckpointId(jobId, checkpointId);
RowData row = SimpleDataUtil.createRowData(2, "world");
expectedRows.add(row);
DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row));
harness.processElement(of(dataFile), ++timestamp);
harness.snapshot(++checkpointId, ++timestamp);
assertFlinkManifests(1);
harness.notifyOfCompletedCheckpoint(checkpointId);
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, expectedRows);
assertSnapshotSize(2);
assertMaxCommittedCheckpointId(jobId, checkpointId);
}
}
@Test
public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception {
// We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's possible that the
// flink job will restore from a checkpoint with only step#1 finished.
long checkpointId = 0;
long timestamp = 0;
OperatorSubtaskState snapshot;
List<RowData> expectedRows = Lists.newArrayList();
JobID jobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
assertSnapshotSize(0);
assertMaxCommittedCheckpointId(jobId, -1L);
RowData row = SimpleDataUtil.createRowData(1, "hello");
expectedRows.add(row);
DataFile dataFile = writeDataFile("data-1", ImmutableList.of(row));
harness.processElement(of(dataFile), ++timestamp);
snapshot = harness.snapshot(++checkpointId, ++timestamp);
SimpleDataUtil.assertTableRows(table, ImmutableList.of());
assertMaxCommittedCheckpointId(jobId, -1L);
assertFlinkManifests(1);
}
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.initializeState(snapshot);
harness.open();
// All flink manifests should be cleaned because it has committed the unfinished iceberg transaction.
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, expectedRows);
assertMaxCommittedCheckpointId(jobId, checkpointId);
harness.snapshot(++checkpointId, ++timestamp);
// Did not write any new record, so it won't generate new manifest.
assertFlinkManifests(0);
harness.notifyOfCompletedCheckpoint(checkpointId);
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, expectedRows);
assertSnapshotSize(2);
assertMaxCommittedCheckpointId(jobId, checkpointId);
RowData row = SimpleDataUtil.createRowData(2, "world");
expectedRows.add(row);
DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row));
harness.processElement(of(dataFile), ++timestamp);
snapshot = harness.snapshot(++checkpointId, ++timestamp);
assertFlinkManifests(1);
}
// Redeploying flink job from external checkpoint.
JobID newJobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(newJobId)) {
harness.setup();
harness.initializeState(snapshot);
harness.open();
// All flink manifests should be cleaned because it has committed the unfinished iceberg transaction.
assertFlinkManifests(0);
assertMaxCommittedCheckpointId(newJobId, -1);
assertMaxCommittedCheckpointId(jobId, checkpointId);
SimpleDataUtil.assertTableRows(table, expectedRows);
assertSnapshotSize(3);
RowData row = SimpleDataUtil.createRowData(3, "foo");
expectedRows.add(row);
DataFile dataFile = writeDataFile("data-3", ImmutableList.of(row));
harness.processElement(of(dataFile), ++timestamp);
harness.snapshot(++checkpointId, ++timestamp);
assertFlinkManifests(1);
harness.notifyOfCompletedCheckpoint(checkpointId);
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, expectedRows);
assertSnapshotSize(4);
assertMaxCommittedCheckpointId(newJobId, checkpointId);
}
}
@Test
public void testStartAnotherJobToWriteSameTable() throws Exception {
long checkpointId = 0;
long timestamp = 0;
List<RowData> rows = Lists.newArrayList();
List<RowData> tableRows = Lists.newArrayList();
JobID oldJobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(oldJobId)) {
harness.setup();
harness.open();
assertSnapshotSize(0);
assertMaxCommittedCheckpointId(oldJobId, -1L);
for (int i = 1; i <= 3; i++) {
rows.add(SimpleDataUtil.createRowData(i, "hello" + i));
tableRows.addAll(rows);
DataFile dataFile = writeDataFile(String.format("data-%d", i), rows);
harness.processElement(of(dataFile), ++timestamp);
harness.snapshot(++checkpointId, ++timestamp);
assertFlinkManifests(1);
harness.notifyOfCompletedCheckpoint(checkpointId);
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, tableRows);
assertSnapshotSize(i);
assertMaxCommittedCheckpointId(oldJobId, checkpointId);
}
}
// The new started job will start with checkpoint = 1 again.
checkpointId = 0;
timestamp = 0;
JobID newJobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(newJobId)) {
harness.setup();
harness.open();
assertSnapshotSize(3);
assertMaxCommittedCheckpointId(oldJobId, 3);
assertMaxCommittedCheckpointId(newJobId, -1);
rows.add(SimpleDataUtil.createRowData(2, "world"));
tableRows.addAll(rows);
DataFile dataFile = writeDataFile("data-new-1", rows);
harness.processElement(of(dataFile), ++timestamp);
harness.snapshot(++checkpointId, ++timestamp);
assertFlinkManifests(1);
harness.notifyOfCompletedCheckpoint(checkpointId);
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, tableRows);
assertSnapshotSize(4);
assertMaxCommittedCheckpointId(newJobId, checkpointId);
}
}
@Test
public void testMultipleJobsWriteSameTable() throws Exception {
long timestamp = 0;
List<RowData> tableRows = Lists.newArrayList();
JobID[] jobs = new JobID[] { new JobID(), new JobID(), new JobID() };
for (int i = 0; i < 20; i++) {
int jobIndex = i % 3;
int checkpointId = i / 3;
JobID jobId = jobs[jobIndex];
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
assertSnapshotSize(i);
assertMaxCommittedCheckpointId(jobId, checkpointId == 0 ? -1 : checkpointId);
List<RowData> rows = Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-" + i));
tableRows.addAll(rows);
DataFile dataFile = writeDataFile(String.format("data-%d", i), rows);
harness.processElement(of(dataFile), ++timestamp);
harness.snapshot(checkpointId + 1, ++timestamp);
assertFlinkManifests(1);
harness.notifyOfCompletedCheckpoint(checkpointId + 1);
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, tableRows);
assertSnapshotSize(i + 1);
assertMaxCommittedCheckpointId(jobId, checkpointId + 1);
}
}
}
@Test
public void testBoundedStream() throws Exception {
JobID jobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
assertFlinkManifests(0);
assertSnapshotSize(0);
assertMaxCommittedCheckpointId(jobId, -1L);
List<RowData> tableRows = Lists.newArrayList(SimpleDataUtil.createRowData(1, "word-1"));
DataFile dataFile = writeDataFile("data-1", tableRows);
harness.processElement(of(dataFile), 1);
((BoundedOneInput) harness.getOneInputOperator()).endInput();
assertFlinkManifests(0);
SimpleDataUtil.assertTableRows(table, tableRows);
assertSnapshotSize(1);
assertMaxCommittedCheckpointId(jobId, Long.MAX_VALUE);
}
}
@Test
public void testFlinkManifests() throws Exception {
long timestamp = 0;
final long checkpoint = 10;
JobID jobId = new JobID();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
assertMaxCommittedCheckpointId(jobId, -1L);
RowData row1 = SimpleDataUtil.createRowData(1, "hello");
DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1));
harness.processElement(of(dataFile1), ++timestamp);
assertMaxCommittedCheckpointId(jobId, -1L);
// 1. snapshotState for checkpoint#1
harness.snapshot(checkpoint, ++timestamp);
List<Path> manifestPaths = assertFlinkManifests(1);
Path manifestPath = manifestPaths.get(0);
Assert.assertEquals("File name should have the expected pattern.", String.format("%s-%05d-%d-%d-%05d.avro", jobId, 0, 0, checkpoint, 1), manifestPath.getFileName().toString());
// 2. Read the data files from manifests and assert.
List<DataFile> dataFiles = FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io());
Assert.assertEquals(1, dataFiles.size());
TestFlinkManifest.checkContentFile(dataFile1, dataFiles.get(0));
// 3. notifyCheckpointComplete for checkpoint#1
harness.notifyOfCompletedCheckpoint(checkpoint);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1));
assertMaxCommittedCheckpointId(jobId, checkpoint);
assertFlinkManifests(0);
}
}
@Test
public void testDeleteFiles() throws Exception {
Assume.assumeFalse("Only support equality-delete in format v2.", formatVersion < 2);
long timestamp = 0;
long checkpoint = 10;
JobID jobId = new JobID();
FileAppenderFactory<RowData> appenderFactory = createDeletableAppenderFactory();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
assertMaxCommittedCheckpointId(jobId, -1L);
RowData row1 = SimpleDataUtil.createInsert(1, "aaa");
DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(row1));
harness.processElement(of(dataFile1), ++timestamp);
assertMaxCommittedCheckpointId(jobId, -1L);
// 1. snapshotState for checkpoint#1
harness.snapshot(checkpoint, ++timestamp);
List<Path> manifestPaths = assertFlinkManifests(1);
Path manifestPath = manifestPaths.get(0);
Assert.assertEquals("File name should have the expected pattern.", String.format("%s-%05d-%d-%d-%05d.avro", jobId, 0, 0, checkpoint, 1), manifestPath.getFileName().toString());
// 2. Read the data files from manifests and assert.
List<DataFile> dataFiles = FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io());
Assert.assertEquals(1, dataFiles.size());
TestFlinkManifest.checkContentFile(dataFile1, dataFiles.get(0));
// 3. notifyCheckpointComplete for checkpoint#1
harness.notifyOfCompletedCheckpoint(checkpoint);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1));
assertMaxCommittedCheckpointId(jobId, checkpoint);
assertFlinkManifests(0);
// 4. process both data files and delete files.
RowData row2 = SimpleDataUtil.createInsert(2, "bbb");
DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2));
RowData delete1 = SimpleDataUtil.createDelete(1, "aaa");
DeleteFile deleteFile1 = writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1));
harness.processElement(WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build(), ++timestamp);
assertMaxCommittedCheckpointId(jobId, checkpoint);
// 5. snapshotState for checkpoint#2
harness.snapshot(++checkpoint, ++timestamp);
assertFlinkManifests(2);
// 6. notifyCheckpointComplete for checkpoint#2
harness.notifyOfCompletedCheckpoint(checkpoint);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(row2));
assertMaxCommittedCheckpointId(jobId, checkpoint);
assertFlinkManifests(0);
}
}
@Test
public void testValidateDataFileExist() throws Exception {
Assume.assumeFalse("Only support equality-delete in format v2.", formatVersion < 2);
long timestamp = 0;
long checkpoint = 10;
JobID jobId = new JobID();
FileAppenderFactory<RowData> appenderFactory = createDeletableAppenderFactory();
RowData insert1 = SimpleDataUtil.createInsert(1, "aaa");
DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1));
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
// Txn#1: insert the row <1, 'aaa'>
harness.processElement(WriteResult.builder().addDataFiles(dataFile1).build(), ++timestamp);
harness.snapshot(checkpoint, ++timestamp);
harness.notifyOfCompletedCheckpoint(checkpoint);
// Txn#2: Overwrite the committed data-file-1
RowData insert2 = SimpleDataUtil.createInsert(2, "bbb");
DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert2));
new TestTableLoader(tablePath).loadTable().newOverwrite().addFile(dataFile2).deleteFile(dataFile1).commit();
}
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
// Txn#3: position-delete the <1, 'aaa'> (NOT committed).
DeleteFile deleteFile1 = writePosDeleteFile(appenderFactory, "pos-delete-file-1", ImmutableList.of(Pair.of(dataFile1.path(), 0L)));
harness.processElement(WriteResult.builder().addDeleteFiles(deleteFile1).addReferencedDataFiles(dataFile1.path()).build(), ++timestamp);
harness.snapshot(++checkpoint, ++timestamp);
// Txn#3: validation will fail when committing.
final long currentCheckpointId = checkpoint;
AssertHelpers.assertThrows("Validation should be failure because of non-exist data files.", ValidationException.class, "Cannot commit, missing data files", () -> {
harness.notifyOfCompletedCheckpoint(currentCheckpointId);
return null;
});
}
}
@Test
public void testCommitTwoCheckpointsInSingleTxn() throws Exception {
Assume.assumeFalse("Only support equality-delete in format v2.", formatVersion < 2);
long timestamp = 0;
long checkpoint = 10;
JobID jobId = new JobID();
FileAppenderFactory<RowData> appenderFactory = createDeletableAppenderFactory();
try (OneInputStreamOperatorTestHarness<WriteResult, Void> harness = createStreamSink(jobId)) {
harness.setup();
harness.open();
assertMaxCommittedCheckpointId(jobId, -1L);
RowData insert1 = SimpleDataUtil.createInsert(1, "aaa");
RowData insert2 = SimpleDataUtil.createInsert(2, "bbb");
RowData delete3 = SimpleDataUtil.createDelete(3, "ccc");
DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1, insert2));
DeleteFile deleteFile1 = writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3));
harness.processElement(WriteResult.builder().addDataFiles(dataFile1).addDeleteFiles(deleteFile1).build(), ++timestamp);
// The 1st snapshotState.
harness.snapshot(checkpoint, ++timestamp);
RowData insert4 = SimpleDataUtil.createInsert(4, "ddd");
RowData delete2 = SimpleDataUtil.createDelete(2, "bbb");
DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert4));
DeleteFile deleteFile2 = writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2));
harness.processElement(WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile2).build(), ++timestamp);
// The 2nd snapshotState.
harness.snapshot(++checkpoint, ++timestamp);
// Notify the 2nd snapshot to complete.
harness.notifyOfCompletedCheckpoint(checkpoint);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4));
assertMaxCommittedCheckpointId(jobId, checkpoint);
assertFlinkManifests(0);
Assert.assertEquals("Should have committed 2 txn.", 2, ImmutableList.copyOf(table.snapshots()).size());
}
}
private DeleteFile writeEqDeleteFile(FileAppenderFactory<RowData> appenderFactory, String filename, List<RowData> deletes) throws IOException {
return SimpleDataUtil.writeEqDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes);
}
private DeleteFile writePosDeleteFile(FileAppenderFactory<RowData> appenderFactory, String filename, List<Pair<CharSequence, Long>> positions) throws IOException {
return SimpleDataUtil.writePosDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, positions);
}
private FileAppenderFactory<RowData> createDeletableAppenderFactory() {
int[] equalityFieldIds = new int[] { table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() };
return new FlinkAppenderFactory(table.schema(), FlinkSchemaUtil.convert(table.schema()), table.properties(), table.spec(), equalityFieldIds, table.schema(), null);
}
private ManifestFile createTestingManifestFile(Path manifestPath) {
return new GenericManifestFile(manifestPath.toAbsolutePath().toString(), manifestPath.toFile().length(), 0, ManifestContent.DATA, 0, 0, 0L, 0, 0, 0, 0, 0, 0, null);
}
private List<Path> assertFlinkManifests(int expectedCount) throws IOException {
List<Path> manifests = Files.list(flinkManifestFolder.toPath()).filter(p -> !p.toString().endsWith(".crc")).collect(Collectors.toList());
replacedert.replacedertEquals(String.format("Expected %s flink manifests, but the list is: %s", expectedCount, manifests), expectedCount, manifests.size());
return manifests;
}
private DataFile writeDataFile(String filename, List<RowData> rows) throws IOException {
return SimpleDataUtil.writeFile(table.schema(), table.spec(), CONF, tablePath, format.addExtension(filename), rows);
}
private void replacedertMaxCommittedCheckpointId(JobID jobID, long expectedId) {
table.refresh();
long actualId = IcebergFilesCommitter.getMaxCommittedCheckpointId(table, jobID.toString());
replacedert.replacedertEquals(expectedId, actualId);
}
private void replacedertSnapshotSize(int expectedSnapshotSize) {
table.refresh();
replacedert.replacedertEquals(expectedSnapshotSize, Lists.newArrayList(table.snapshots()).size());
}
private OneInputStreamOperatorTestHarness<WriteResult, Void> createStreamSink(JobID jobID) throws Exception {
TestOperatorFactory factory = TestOperatorFactory.of(tablePath);
return new OneInputStreamOperatorTestHarness<>(factory, createEnvironment(jobID));
}
private static MockEnvironment createEnvironment(JobID jobID) {
return new MockEnvironmentBuilder().setTaskName("test task").setManagedMemorySize(32 * 1024).setInputSplitProvider(new MockInputSplitProvider()).setBufferSize(256).setTaskConfiguration(new org.apache.flink.configuration.Configuration()).setExecutionConfig(new ExecutionConfig()).setMaxParallelism(16).setJobID(jobID).build();
}
private static clreplaced TestOperatorFactory extends AbstractStreamOperatorFactory<Void> implements OneInputStreamOperatorFactory<WriteResult, Void> {
private final String tablePath;
private TestOperatorFactory(String tablePath) {
this.tablePath = tablePath;
}
private static TestOperatorFactory of(String tablePath) {
return new TestOperatorFactory(tablePath);
}
@Override
@SuppressWarnings("unchecked")
public <T extends StreamOperator<Void>> T createStreamOperator(StreamOperatorParameters<Void> param) {
IcebergFilesCommitter committer = new IcebergFilesCommitter(new TestTableLoader(tablePath), false);
committer.setup(param.getContainingTask(), param.getStreamConfig(), param.getOutput());
return (T) committer;
}
@Override
public Clreplaced<? extends StreamOperator> getStreamOperatorClreplaced(ClreplacedLoader clreplacedLoader) {
return IcebergFilesCommitter.clreplaced;
}
}
}
17
Source : FlinkSink.java
with Apache License 2.0
from apache
static IcebergStreamWriter<RowData> createStreamWriter(Table table, RowType flinkRowType, List<Integer> equalityFieldIds) {
Map<String, String> props = table.properties();
long targetFileSize = getTargetFileSizeBytes(props);
FileFormat fileFormat = getFileFormat(props);
TaskWriterFactory<RowData> taskWriterFactory = new RowDataTaskWriterFactory(table.schema(), flinkRowType, table.spec(), table.locationProvider(), table.io(), table.encryption(), targetFileSize, fileFormat, props, equalityFieldIds);
return new IcebergStreamWriter<>(table.name(), taskWriterFactory);
}
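The getFileFormat and getTargetFileSizeBytes helpers are not included in this excerpt. A minimal sketch of how such a format lookup could work, assuming the standard 'write.format.default' table property (TableProperties.DEFAULT_FILE_FORMAT) with a parquet fallback:
// Hypothetical helper, not taken from this source file: resolve the FileFormat from table properties.
private static FileFormat getFileFormat(Map<String, String> properties) {
String formatName = properties.getOrDefault(TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
return FileFormat.valueOf(formatName.toUpperCase(Locale.ENGLISH));
}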
17
Source : TestTaskEqualityDeltaWriter.java
with Apache License 2.0
from apache
@RunWith(Parameterized.clreplaced)
public clreplaced TestTaskEqualityDeltaWriter extends TableTestBase {
private static final int FORMAT_V2 = 2;
private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024L;
private final FileFormat format;
private final GenericRecord gRecord = GenericRecord.create(SCHEMA);
private final GenericRecord posRecord = GenericRecord.create(DeleteSchemaUtil.pathPosSchema());
private OutputFileFactory fileFactory = null;
private int idFieldId;
private int dataFieldId;
@Parameterized.Parameters(name = "FileFormat = {0}")
public static Object[][] parameters() {
return new Object[][] { { "avro" }, { "parquet" } };
}
public TestTaskEqualityDeltaWriter(String fileFormat) {
super(FORMAT_V2);
this.format = FileFormat.valueOf(fileFormat.toUpperCase(Locale.ENGLISH));
}
@Before
public void setupTable() throws IOException {
this.tableDir = temp.newFolder();
// created by table create
replacedert.replacedertTrue(tableDir.delete());
this.metadataDir = new File(tableDir, "metadata");
this.table = create(SCHEMA, ParreplacedionSpec.unparreplacedioned());
this.fileFactory = new OutputFileFactory(table.spec(), format, table.locationProvider(), table.io(), table.encryption(), 1, 1);
this.idFieldId = table.schema().findField("id").fieldId();
this.dataFieldId = table.schema().findField("data").fieldId();
table.updateProperties().defaultFormat(format).commit();
}
private Record createRecord(Integer id, String data) {
return gRecord.copy("id", id, "data", data);
}
@Test
public void testPureInsert() throws IOException {
List<Integer> eqDeleteFieldIds = Lists.newArrayList(idFieldId, dataFieldId);
Schema eqDeleteRowSchema = table.schema();
GenericTaskDeltaWriter deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema);
List<Record> expected = Lists.newArrayList();
for (int i = 0; i < 20; i++) {
Record record = createRecord(i, String.format("val-%d", i));
expected.add(record);
deltaWriter.write(record);
}
WriteResult result = deltaWriter.complete();
replacedert.replacedertEquals("Should only have a data file.", 1, result.dataFiles().length);
replacedert.replacedertEquals("Should have no delete file", 0, result.deleteFiles().length);
commitTransaction(result);
replacedert.replacedertEquals("Should have expected records", expectedRowSet(expected), actualRowSet("*"));
deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema);
for (int i = 20; i < 30; i++) {
Record record = createRecord(i, String.format("val-%d", i));
expected.add(record);
deltaWriter.write(record);
}
result = deltaWriter.complete();
replacedert.replacedertEquals("Should only have a data file.", 1, result.dataFiles().length);
replacedert.replacedertEquals("Should have no delete file", 0, result.deleteFiles().length);
commitTransaction(result);
replacedert.replacedertEquals("Should have expected records", expectedRowSet(expected), actualRowSet("*"));
}
@Test
public void testInsertDuplicatedKey() throws IOException {
List<Integer> equalityFieldIds = Lists.newArrayList(idFieldId);
Schema eqDeleteRowSchema = table.schema();
GenericTaskDeltaWriter deltaWriter = createTaskWriter(equalityFieldIds, eqDeleteRowSchema);
deltaWriter.write(createRecord(1, "aaa"));
deltaWriter.write(createRecord(2, "bbb"));
deltaWriter.write(createRecord(3, "ccc"));
deltaWriter.write(createRecord(4, "ddd"));
deltaWriter.write(createRecord(4, "eee"));
deltaWriter.write(createRecord(3, "fff"));
deltaWriter.write(createRecord(2, "ggg"));
deltaWriter.write(createRecord(1, "hhh"));
WriteResult result = deltaWriter.complete();
commitTransaction(result);
replacedert.replacedertEquals("Should have a data file.", 1, result.dataFiles().length);
replacedert.replacedertEquals("Should have a pos-delete file", 1, result.deleteFiles().length);
DeleteFile posDeleteFile = result.deleteFiles()[0];
replacedert.replacedertEquals("Should be a pos-delete file", FileContent.POSITION_DELETES, posDeleteFile.content());
replacedert.replacedertEquals(1, result.referencedDataFiles().length);
replacedert.replacedertEquals("Should have expected records", expectedRowSet(ImmutableList.of(createRecord(4, "eee"), createRecord(3, "fff"), createRecord(2, "ggg"), createRecord(1, "hhh"))), actualRowSet("*"));
// Check records in the data file.
DataFile dataFile = result.dataFiles()[0];
replacedert.replacedertEquals(ImmutableList.of(createRecord(1, "aaa"), createRecord(2, "bbb"), createRecord(3, "ccc"), createRecord(4, "ddd"), createRecord(4, "eee"), createRecord(3, "fff"), createRecord(2, "ggg"), createRecord(1, "hhh")), readRecordsAsList(table.schema(), dataFile.path()));
// Check records in the pos-delete file.
Schema posDeleteSchema = DeleteSchemaUtil.pathPosSchema();
replacedert.replacedertEquals(ImmutableList.of(posRecord.copy("file_path", dataFile.path(), "pos", 0L), posRecord.copy("file_path", dataFile.path(), "pos", 1L), posRecord.copy("file_path", dataFile.path(), "pos", 2L), posRecord.copy("file_path", dataFile.path(), "pos", 3L)), readRecordsAsList(posDeleteSchema, posDeleteFile.path()));
}
@Test
public void testUpsertSameRow() throws IOException {
List<Integer> eqDeleteFieldIds = Lists.newArrayList(idFieldId, dataFieldId);
Schema eqDeleteRowSchema = table.schema();
GenericTaskDeltaWriter deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema);
Record record = createRecord(1, "aaa");
deltaWriter.write(record);
// UPSERT <1, 'aaa'> to <1, 'aaa'>
deltaWriter.delete(record);
deltaWriter.write(record);
WriteResult result = deltaWriter.complete();
replacedert.replacedertEquals("Should have a data file.", 1, result.dataFiles().length);
replacedert.replacedertEquals("Should have a pos-delete file and an eq-delete file", 2, result.deleteFiles().length);
commitTransaction(result);
replacedert.replacedertEquals("Should have an expected record", expectedRowSet(ImmutableList.of(record)), actualRowSet("*"));
// Check records in the data file.
DataFile dataFile = result.dataFiles()[0];
replacedert.replacedertEquals(ImmutableList.of(record, record), readRecordsAsList(table.schema(), dataFile.path()));
// Check records in the eq-delete file.
DeleteFile eqDeleteFile = result.deleteFiles()[0];
replacedert.replacedertEquals(ImmutableList.of(record), readRecordsAsList(eqDeleteRowSchema, eqDeleteFile.path()));
// Check records in the pos-delete file.
DeleteFile posDeleteFile = result.deleteFiles()[1];
replacedert.replacedertEquals(ImmutableList.of(posRecord.copy("file_path", dataFile.path(), "pos", 0L)), readRecordsAsList(DeleteSchemaUtil.pathPosSchema(), posDeleteFile.path()));
deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema);
deltaWriter.delete(record);
result = deltaWriter.complete();
replacedert.replacedertEquals("Should have 0 data file.", 0, result.dataFiles().length);
replacedert.replacedertEquals("Should have 1 eq-delete file", 1, result.deleteFiles().length);
commitTransaction(result);
replacedert.replacedertEquals("Should have no record", expectedRowSet(ImmutableList.of()), actualRowSet("*"));
}
@Test
public void testUpsertData() throws IOException {
List<Integer> eqDeleteFieldIds = Lists.newArrayList(dataFieldId);
Schema eqDeleteRowSchema = table.schema().select("data");
GenericTaskDeltaWriter deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema);
deltaWriter.write(createRecord(1, "aaa"));
deltaWriter.write(createRecord(2, "bbb"));
deltaWriter.write(createRecord(3, "aaa"));
deltaWriter.write(createRecord(3, "ccc"));
deltaWriter.write(createRecord(4, "ccc"));
// Commit the first transaction.
WriteResult result = deltaWriter.complete();
replacedert.replacedertEquals("Should have a data file", 1, result.dataFiles().length);
replacedert.replacedertEquals("Should have a pos-delete file for deduplication purpose", 1, result.deleteFiles().length);
replacedert.replacedertEquals("Should be pos-delete file", FileContent.POSITION_DELETES, result.deleteFiles()[0].content());
replacedert.replacedertEquals(1, result.referencedDataFiles().length);
commitTransaction(result);
replacedert.replacedertEquals("Should have expected records", expectedRowSet(ImmutableList.of(createRecord(2, "bbb"), createRecord(3, "aaa"), createRecord(4, "ccc"))), actualRowSet("*"));
// Start the 2nd transaction.
deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema);
GenericRecord keyRecord = GenericRecord.create(eqDeleteRowSchema);
Function<String, Record> keyFunc = data -> keyRecord.copy("data", data);
// UPSERT <3,'aaa'> to <5,'aaa'> - (by deleting the key)
deltaWriter.deleteKey(keyFunc.apply("aaa"));
deltaWriter.write(createRecord(5, "aaa"));
// UPSERT <5,'aaa'> to <6,'aaa'> - (by deleting the key)
deltaWriter.deleteKey(keyFunc.apply("aaa"));
deltaWriter.write(createRecord(6, "aaa"));
// UPSERT <4,'ccc'> to <7,'ccc'> - (by deleting the key)
deltaWriter.deleteKey(keyFunc.apply("ccc"));
deltaWriter.write(createRecord(7, "ccc"));
// DELETE <2, 'bbb'> - (by deleting the key)
deltaWriter.deleteKey(keyFunc.apply("bbb"));
// Commit the 2nd transaction.
result = deltaWriter.complete();
replacedert.replacedertEquals(1, result.dataFiles().length);
replacedert.replacedertEquals(2, result.deleteFiles().length);
commitTransaction(result);
replacedert.replacedertEquals("Should have expected records", expectedRowSet(ImmutableList.of(createRecord(6, "aaa"), createRecord(7, "ccc"))), actualRowSet("*"));
// Check records in the data file.
DataFile dataFile = result.dataFiles()[0];
replacedert.replacedertEquals(ImmutableList.of(createRecord(5, "aaa"), createRecord(6, "aaa"), createRecord(7, "ccc")), readRecordsAsList(table.schema(), dataFile.path()));
// Check records in the eq-delete file.
DeleteFile eqDeleteFile = result.deleteFiles()[0];
replacedert.replacedertEquals(FileContent.EQUALITY_DELETES, eqDeleteFile.content());
replacedert.replacedertEquals(ImmutableList.of(keyFunc.apply("aaa"), keyFunc.apply("aaa"), keyFunc.apply("ccc"), keyFunc.apply("bbb")), readRecordsAsList(eqDeleteRowSchema, eqDeleteFile.path()));
// Check records in the pos-delete file.
DeleteFile posDeleteFile = result.deleteFiles()[1];
Schema posDeleteSchema = DeleteSchemaUtil.pathPosSchema();
replacedert.replacedertEquals(FileContent.POSITION_DELETES, posDeleteFile.content());
replacedert.replacedertEquals(ImmutableList.of(posRecord.copy("file_path", dataFile.path(), "pos", 0L)), readRecordsAsList(posDeleteSchema, posDeleteFile.path()));
}
@Test
public void testUpsertDataWithFullRowSchema() throws IOException {
List<Integer> eqDeleteFieldIds = Lists.newArrayList(dataFieldId);
Schema eqDeleteRowSchema = table.schema();
GenericTaskDeltaWriter deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema);
deltaWriter.write(createRecord(1, "aaa"));
deltaWriter.write(createRecord(2, "bbb"));
deltaWriter.write(createRecord(3, "aaa"));
deltaWriter.write(createRecord(3, "ccc"));
deltaWriter.write(createRecord(4, "ccc"));
// Commit the first transaction.
WriteResult result = deltaWriter.complete();
replacedert.replacedertEquals("Should have a data file", 1, result.dataFiles().length);
replacedert.replacedertEquals("Should have a pos-delete file for deduplication purpose", 1, result.deleteFiles().length);
replacedert.replacedertEquals("Should be pos-delete file", FileContent.POSITION_DELETES, result.deleteFiles()[0].content());
replacedert.replacedertEquals(1, result.referencedDataFiles().length);
commitTransaction(result);
replacedert.replacedertEquals("Should have expected records", expectedRowSet(ImmutableList.of(createRecord(2, "bbb"), createRecord(3, "aaa"), createRecord(4, "ccc"))), actualRowSet("*"));
// Start the 2nd transaction.
deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema);
// UPSERT <3,'aaa'> to <5,'aaa'> - (by deleting the entire row)
deltaWriter.delete(createRecord(3, "aaa"));
deltaWriter.write(createRecord(5, "aaa"));
// UPSERT <5,'aaa'> to <6,'aaa'> - (by deleting the entire row)
deltaWriter.delete(createRecord(5, "aaa"));
deltaWriter.write(createRecord(6, "aaa"));
// UPSERT <4,'ccc'> to <7,'ccc'> - (by deleting the entire row)
deltaWriter.delete(createRecord(4, "ccc"));
deltaWriter.write(createRecord(7, "ccc"));
// DELETE <2, 'bbb'> - (by deleting the entire row)
deltaWriter.delete(createRecord(2, "bbb"));
// Commit the 2nd transaction.
result = deltaWriter.complete();
replacedert.replacedertEquals(1, result.dataFiles().length);
replacedert.replacedertEquals(2, result.deleteFiles().length);
replacedert.replacedertEquals(1, result.referencedDataFiles().length);
commitTransaction(result);
replacedert.replacedertEquals("Should have expected records", expectedRowSet(ImmutableList.of(createRecord(6, "aaa"), createRecord(7, "ccc"))), actualRowSet("*"));
// Check records in the data file.
DataFile dataFile = result.dataFiles()[0];
replacedert.replacedertEquals(ImmutableList.of(createRecord(5, "aaa"), createRecord(6, "aaa"), createRecord(7, "ccc")), readRecordsAsList(table.schema(), dataFile.path()));
// Check records in the eq-delete file.
DeleteFile eqDeleteFile = result.deleteFiles()[0];
replacedert.replacedertEquals(FileContent.EQUALITY_DELETES, eqDeleteFile.content());
replacedert.replacedertEquals(ImmutableList.of(createRecord(3, "aaa"), createRecord(5, "aaa"), createRecord(4, "ccc"), createRecord(2, "bbb")), readRecordsAsList(eqDeleteRowSchema, eqDeleteFile.path()));
// Check records in the pos-delete file.
DeleteFile posDeleteFile = result.deleteFiles()[1];
Schema posDeleteSchema = DeleteSchemaUtil.pathPosSchema();
replacedert.replacedertEquals(FileContent.POSITION_DELETES, posDeleteFile.content());
replacedert.replacedertEquals(ImmutableList.of(posRecord.copy("file_path", dataFile.path(), "pos", 0L)), readRecordsAsList(posDeleteSchema, posDeleteFile.path()));
}
private void commitTransaction(WriteResult result) {
RowDelta rowDelta = table.newRowDelta();
Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
rowDelta.validateDeletedFiles().validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())).commit();
}
private StructLikeSet expectedRowSet(Iterable<Record> records) {
StructLikeSet set = StructLikeSet.create(table.schema().replacedtruct());
records.forEach(set::add);
return set;
}
private StructLikeSet actualRowSet(String... columns) throws IOException {
StructLikeSet set = StructLikeSet.create(table.schema().replacedtruct());
try (CloseableIterable<Record> reader = IcebergGenerics.read(table).select(columns).build()) {
reader.forEach(set::add);
}
return set;
}
/**
* Create a generic task equality delta writer.
*
* @param equalityFieldIds defines the equality field ids.
* @param eqDeleteRowSchema defines the schema of rows that the eq-delete writer will write; it may be the full
* table schema.
*/
private GenericTaskDeltaWriter createTaskWriter(List<Integer> equalityFieldIds, Schema eqDeleteRowSchema) {
FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), ArrayUtil.toIntArray(equalityFieldIds), eqDeleteRowSchema, null);
List<String> columns = Lists.newArrayList();
for (Integer fieldId : equalityFieldIds) {
columns.add(table.schema().findField(fieldId).name());
}
Schema deleteSchema = table.schema().select(columns);
return new GenericTaskDeltaWriter(table.schema(), deleteSchema, table.spec(), format, appenderFactory, fileFactory, table.io(), TARGET_FILE_SIZE);
}
private static clreplaced GenericTaskDeltaWriter extends BaseTaskWriter<Record> {
private final GenericEqualityDeltaWriter deltaWriter;
private GenericTaskDeltaWriter(Schema schema, Schema deleteSchema, ParreplacedionSpec spec, FileFormat format, FileAppenderFactory<Record> appenderFactory, OutputFileFactory fileFactory, FileIO io, long targetFileSize) {
super(spec, format, appenderFactory, fileFactory, io, targetFileSize);
this.deltaWriter = new GenericEqualityDeltaWriter(null, schema, deleteSchema);
}
@Override
public void write(Record row) throws IOException {
deltaWriter.write(row);
}
public void delete(Record row) throws IOException {
deltaWriter.delete(row);
}
public void deleteKey(Record key) throws IOException {
deltaWriter.deleteKey(key);
}
@Override
public void close() throws IOException {
deltaWriter.close();
}
private clreplaced GenericEqualityDeltaWriter extends BaseEqualityDeltaWriter {
private GenericEqualityDeltaWriter(ParreplacedionKey parreplacedion, Schema schema, Schema eqDeleteSchema) {
super(parreplacedion, schema, eqDeleteSchema);
}
@Override
protected StructLike replacedtructLike(Record row) {
return row;
}
}
}
private List<Record> readRecordsAsList(Schema schema, CharSequence path) throws IOException {
CloseableIterable<Record> iterable;
InputFile inputFile = Files.localInput(path.toString());
switch(format) {
case PARQUET:
iterable = Parquet.read(inputFile).project(schema).createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)).build();
break;
case AVRO:
iterable = Avro.read(inputFile).project(schema).createReaderFunc(DataReader::create).build();
break;
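// Illustrative extra branch, not part of the original test (which is parameterized with Avro and Parquet only):
// an ORC reader could plausibly be built the same way, assuming iceberg-data's GenericOrcReader is available.
case ORC:
iterable = ORC.read(inputFile).project(schema).createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)).build();
break;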
default:
throw new UnsupportedOperationException("Unsupported file format: " + format);
}
try (CloseableIterable<Record> closeableIterable = iterable) {
return Lists.newArrayList(closeableIterable);
}
}
}
17
Source : TestGenericSortedPosDeleteWriter.java
with Apache License 2.0
from apache
@RunWith(Parameterized.clreplaced)
public clreplaced TestGenericSortedPosDeleteWriter extends TableTestBase {
private static final int FORMAT_V2 = 2;
private final FileFormat format;
private OutputFileFactory fileFactory;
private Record gRecord;
@Parameterized.Parameters(name = "FileFormat={0}")
public static Object[] parameters() {
return new Object[][] { new Object[] { "avro" }, new Object[] { "parquet" } };
}
public TestGenericSortedPosDeleteWriter(String fileFormat) {
super(FORMAT_V2);
this.format = FileFormat.valueOf(fileFormat.toUpperCase(Locale.ENGLISH));
}
@Before
public void setupTable() throws IOException {
this.tableDir = temp.newFolder();
replacedert.replacedertTrue(tableDir.delete());
this.metadataDir = new File(tableDir, "metadata");
this.table = create(SCHEMA, ParreplacedionSpec.unparreplacedioned());
this.gRecord = GenericRecord.create(SCHEMA);
this.fileFactory = new OutputFileFactory(table.spec(), format, table.locationProvider(), table.io(), table.encryption(), 1, 1);
table.updateProperties().defaultFormat(format).commit();
}
private EncryptedOutputFile createEncryptedOutputFile() {
return fileFactory.newOutputFile();
}
private DataFile prepareDataFile(FileAppenderFactory<Record> appenderFactory, List<Record> rowSet) throws IOException {
DataWriter<Record> writer = appenderFactory.newDataWriter(createEncryptedOutputFile(), format, null);
try (DataWriter<Record> closeableWriter = writer) {
for (Record record : rowSet) {
closeableWriter.add(record);
}
}
return writer.toDataFile();
}
private Record createRow(Integer id, String data) {
Record row = gRecord.copy();
row.setField("id", id);
row.setField("data", data);
return row;
}
private StructLikeSet expectedRowSet(Iterable<Record> records) {
StructLikeSet set = StructLikeSet.create(table.schema().replacedtruct());
records.forEach(set::add);
return set;
}
private StructLikeSet actualRowSet(String... columns) throws IOException {
StructLikeSet set = StructLikeSet.create(table.schema().replacedtruct());
try (CloseableIterable<Record> reader = IcebergGenerics.read(table).select(columns).build()) {
reader.forEach(set::add);
}
return set;
}
@Test
public void testSortedPosDelete() throws IOException {
List<Record> rowSet = Lists.newArrayList(createRow(0, "aaa"), createRow(1, "bbb"), createRow(2, "ccc"), createRow(3, "ddd"), createRow(4, "eee"));
FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), null, null, null);
DataFile dataFile = prepareDataFile(appenderFactory, rowSet);
SortedPosDeleteWriter<Record> writer = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 100);
try (SortedPosDeleteWriter<Record> closeableWriter = writer) {
for (int index = rowSet.size() - 1; index >= 0; index -= 2) {
closeableWriter.delete(dataFile.path(), index);
}
}
List<DeleteFile> deleteFiles = writer.complete();
replacedert.replacedertEquals(1, deleteFiles.size());
DeleteFile deleteFile = deleteFiles.get(0);
// Check whether the path-pos pairs are sorted as expected.
Schema pathPosSchema = DeleteSchemaUtil.pathPosSchema();
Record record = GenericRecord.create(pathPosSchema);
List<Record> expectedDeletes = Lists.newArrayList(record.copy("file_path", dataFile.path(), "pos", 0L), record.copy("file_path", dataFile.path(), "pos", 2L), record.copy("file_path", dataFile.path(), "pos", 4L));
replacedert.replacedertEquals(expectedDeletes, readRecordsAsList(pathPosSchema, deleteFile.path()));
table.newRowDelta().addRows(dataFile).addDeletes(deleteFiles.get(0)).validateDataFilesExist(writer.referencedDataFiles()).validateDeletedFiles().commit();
List<Record> expectedData = Lists.newArrayList(createRow(1, "bbb"), createRow(3, "ddd"));
replacedert.replacedertEquals("Should have the expected records", expectedRowSet(expectedData), actualRowSet("*"));
}
@Test
public void testSortedPosDeleteWithSchemaAndNullRow() throws IOException {
List<Record> rowSet = Lists.newArrayList(createRow(0, "aaa"), createRow(1, "bbb"), createRow(2, "ccc"));
// Create a FileAppenderFactory which requires pos-delete row schema.
FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), null, null, table.schema());
DataFile dataFile = prepareDataFile(appenderFactory, rowSet);
SortedPosDeleteWriter<Record> writer = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 1);
boolean caughtError = false;
try {
writer.delete(dataFile.path(), 0L);
} catch (Exception e) {
caughtError = true;
}
replacedert.replacedertTrue("Should fail because the appender are required non-null rows to write", caughtError);
}
@Test
public void testSortedPosDeleteWithRow() throws IOException {
List<Record> rowSet = Lists.newArrayList(createRow(0, "aaa"), createRow(1, "bbb"), createRow(2, "ccc"), createRow(3, "ddd"), createRow(4, "eee"));
FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), null, null, table.schema());
DataFile dataFile = prepareDataFile(appenderFactory, rowSet);
SortedPosDeleteWriter<Record> writer = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 100);
try (SortedPosDeleteWriter<Record> closeableWriter = writer) {
for (int index = rowSet.size() - 1; index >= 0; index -= 2) {
// Write deletes with row.
closeableWriter.delete(dataFile.path(), index, rowSet.get(index));
}
}
List<DeleteFile> deleteFiles = writer.complete();
replacedert.replacedertEquals(1, deleteFiles.size());
DeleteFile deleteFile = deleteFiles.get(0);
// Check whether the path-pos pairs are sorted as expected.
Schema pathPosSchema = DeleteSchemaUtil.posDeleteSchema(table.schema());
Record record = GenericRecord.create(pathPosSchema);
List<Record> expectedDeletes = Lists.newArrayList(record.copy("file_path", dataFile.path(), "pos", 0L, "row", createRow(0, "aaa")), record.copy("file_path", dataFile.path(), "pos", 2L, "row", createRow(2, "ccc")), record.copy("file_path", dataFile.path(), "pos", 4L, "row", createRow(4, "eee")));
replacedert.replacedertEquals(expectedDeletes, readRecordsAsList(pathPosSchema, deleteFile.path()));
table.newRowDelta().addRows(dataFile).addDeletes(deleteFiles.get(0)).validateDataFilesExist(writer.referencedDataFiles()).validateDeletedFiles().commit();
List<Record> expectedData = Lists.newArrayList(createRow(1, "bbb"), createRow(3, "ddd"));
replacedert.replacedertEquals("Should have the expected records", expectedRowSet(expectedData), actualRowSet("*"));
}
@Test
public void testMultipleFlush() throws IOException {
FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), null, null, null);
// It will produce 5 record lists; each list is written into a separate data file:
// The 1st file has: <0 , val-0> , <1 , val-1> , ... , <99 , val-99>
// The 2nd file has: <100, val-100> , <101, val-101> , ... , <199, val-199>
// The 3rd file has: <200, val-200> , <201, val-201> , ... , <299, val-299>
// The 4th file has: <300, val-300> , <301, val-301> , ... , <399, val-399>
// The 5th file has: <400, val-400> , <401, val-401> , ... , <499, val-499>
List<DataFile> dataFiles = Lists.newArrayList();
for (int fileIndex = 0; fileIndex < 5; fileIndex++) {
List<Record> recordList = Lists.newLinkedList();
for (int recordIndex = 0; recordIndex < 100; recordIndex++) {
int id = fileIndex * 100 + recordIndex;
recordList.add(createRow(id, String.format("val-%s", id)));
}
// Write the records and generate the data file.
dataFiles.add(prepareDataFile(appenderFactory, recordList));
}
// Commit those data files to iceberg table.
RowDelta rowDelta = table.newRowDelta();
dataFiles.forEach(rowDelta::addRows);
rowDelta.commit();
SortedPosDeleteWriter<Record> writer = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 50);
try (SortedPosDeleteWriter<Record> closeableWriter = writer) {
for (int pos = 0; pos < 100; pos++) {
for (int fileIndex = 4; fileIndex >= 0; fileIndex--) {
closeableWriter.delete(dataFiles.get(fileIndex).path(), pos);
}
}
}
List<DeleteFile> deleteFiles = writer.complete();
replacedert.replacedertEquals(10, deleteFiles.size());
Schema pathPosSchema = DeleteSchemaUtil.pathPosSchema();
Record record = GenericRecord.create(pathPosSchema);
for (int deleteFileIndex = 0; deleteFileIndex < 10; deleteFileIndex++) {
List<Record> expectedDeletes = Lists.newArrayList();
for (int dataFileIndex = 0; dataFileIndex < 5; dataFileIndex++) {
DataFile dataFile = dataFiles.get(dataFileIndex);
for (long pos = deleteFileIndex * 10; pos < deleteFileIndex * 10 + 10; pos++) {
expectedDeletes.add(record.copy("file_path", dataFile.path(), "pos", pos));
}
}
DeleteFile deleteFile = deleteFiles.get(deleteFileIndex);
replacedert.replacedertEquals(expectedDeletes, readRecordsAsList(pathPosSchema, deleteFile.path()));
}
rowDelta = table.newRowDelta();
deleteFiles.forEach(rowDelta::addDeletes);
rowDelta.commit();
replacedert.replacedertEquals("Should have no record.", expectedRowSet(ImmutableList.of()), actualRowSet("*"));
}
private List<Record> readRecordsAsList(Schema schema, CharSequence path) throws IOException {
CloseableIterable<Record> iterable;
InputFile inputFile = Files.localInput(path.toString());
switch(format) {
case PARQUET:
iterable = Parquet.read(inputFile).project(schema).createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)).build();
break;
case AVRO:
iterable = Avro.read(inputFile).project(schema).createReaderFunc(DataReader::create).build();
break;
default:
throw new UnsupportedOperationException("Unsupported file format: " + format);
}
try (CloseableIterable<Record> closeableIterable = iterable) {
return Lists.newArrayList(closeableIterable);
}
}
}
17
Source : TestBaseTaskWriter.java
with Apache License 2.0
from apache
@RunWith(Parameterized.clreplaced)
public clreplaced TestBaseTaskWriter extends TableTestBase {
private static final int FORMAT_V2 = 2;
private final FileFormat format;
private final GenericRecord gRecord = GenericRecord.create(SCHEMA);
private OutputFileFactory fileFactory = null;
private FileAppenderFactory<Record> appenderFactory = null;
@Parameterized.Parameters(name = "FileFormat = {0}")
public static Object[][] parameters() {
return new Object[][] { { "avro" }, { "parquet" } };
}
public TestBaseTaskWriter(String fileFormat) {
super(FORMAT_V2);
this.format = FileFormat.valueOf(fileFormat.toUpperCase(Locale.ENGLISH));
}
@Before
public void setupTable() throws IOException {
this.tableDir = temp.newFolder();
// created by table create
replacedert.replacedertTrue(tableDir.delete());
this.metadataDir = new File(tableDir, "metadata");
this.table = create(SCHEMA, ParreplacedionSpec.unparreplacedioned());
this.fileFactory = new OutputFileFactory(table.spec(), format, table.locationProvider(), table.io(), table.encryption(), 1, 1);
int firstFieldId = table.schema().findField("id").fieldId();
int secondFieldId = table.schema().findField("data").fieldId();
this.appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), new int[] { firstFieldId, secondFieldId }, table.schema(), null);
table.updateProperties().defaultFormat(format).commit();
}
private Record createRecord(Integer id, String data) {
return gRecord.copy("id", id, "data", data);
}
@Test
public void testWriteZeroRecord() throws IOException {
try (TestTaskWriter writer = createTaskWriter(128 * 1024 * 1024)) {
writer.close();
WriteResult result = writer.complete();
replacedert.replacedertEquals(0, result.dataFiles().length);
replacedert.replacedertEquals(0, result.deleteFiles().length);
writer.close();
result = writer.complete();
replacedert.replacedertEquals(0, result.dataFiles().length);
replacedert.replacedertEquals(0, result.deleteFiles().length);
}
}
@Test
public void testAbort() throws IOException {
List<Record> records = Lists.newArrayList();
for (int i = 0; i < 2000; i++) {
records.add(createRecord(i, "aaa"));
}
List<Path> files;
try (TestTaskWriter taskWriter = createTaskWriter(4)) {
for (Record record : records) {
taskWriter.write(record);
taskWriter.delete(record);
}
// Close the current opened files.
taskWriter.close();
// Assert the current data file count.
files = Files.list(Paths.get(tableDir.getPath(), "data")).filter(p -> !p.toString().endsWith(".crc")).collect(Collectors.toList());
replacedert.replacedertEquals("Should have 4 files but the files are: " + files, 4, files.size());
// Abort to clean all delete files and data files.
taskWriter.abort();
}
for (Path path : files) {
replacedert.replacedertFalse(Files.exists(path));
}
}
@Test
public void testRollIfExceedTargetFileSize() throws IOException {
List<Record> records = Lists.newArrayListWithCapacity(8000);
for (int i = 0; i < 2000; i++) {
records.add(createRecord(i, "aaa"));
records.add(createRecord(i, "bbb"));
records.add(createRecord(i, "ccc"));
records.add(createRecord(i, "ddd"));
}
WriteResult result;
try (TaskWriter<Record> taskWriter = createTaskWriter(4)) {
for (Record record : records) {
taskWriter.write(record);
}
result = taskWriter.complete();
replacedert.replacedertEquals(8, result.dataFiles().length);
replacedert.replacedertEquals(0, result.deleteFiles().length);
}
RowDelta rowDelta = table.newRowDelta();
Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
rowDelta.commit();
List<Record> expected = Lists.newArrayList();
try (TestTaskWriter taskWriter = createTaskWriter(3)) {
for (Record record : records) {
// ex: UPSERT <0, 'aaa'> to <0, 'AAA'>
taskWriter.delete(record);
int id = record.get(0, Integer.clreplaced);
String data = record.get(1, String.clreplaced);
Record newRecord = createRecord(id, data.toUpperCase());
expected.add(newRecord);
taskWriter.write(newRecord);
}
result = taskWriter.complete();
replacedert.replacedertEquals(8, result.dataFiles().length);
replacedert.replacedertEquals(8, result.deleteFiles().length);
}
rowDelta = table.newRowDelta();
Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
rowDelta.commit();
replacedert.replacedertEquals("Should have expected records", expectedRowSet(expected), actualRowSet("*"));
}
private StructLikeSet expectedRowSet(Iterable<Record> records) {
StructLikeSet set = StructLikeSet.create(table.schema().replacedtruct());
records.forEach(set::add);
return set;
}
private StructLikeSet actualRowSet(String... columns) throws IOException {
StructLikeSet set = StructLikeSet.create(table.schema().replacedtruct());
try (CloseableIterable<Record> reader = IcebergGenerics.read(table).select(columns).build()) {
reader.forEach(set::add);
}
return set;
}
private TestTaskWriter createTaskWriter(long targetFileSize) {
return new TestTaskWriter(table.spec(), format, appenderFactory, fileFactory, table.io(), targetFileSize);
}
private static clreplaced TestTaskWriter extends BaseTaskWriter<Record> {
private RollingFileWriter dataWriter;
private RollingEqDeleteWriter deleteWriter;
private TestTaskWriter(ParreplacedionSpec spec, FileFormat format, FileAppenderFactory<Record> appenderFactory, OutputFileFactory fileFactory, FileIO io, long targetFileSize) {
super(spec, format, appenderFactory, fileFactory, io, targetFileSize);
this.dataWriter = new RollingFileWriter(null);
this.deleteWriter = new RollingEqDeleteWriter(null);
}
@Override
public void write(Record row) throws IOException {
dataWriter.write(row);
}
void delete(Record row) throws IOException {
deleteWriter.write(row);
}
@Override
public void close() throws IOException {
if (dataWriter != null) {
dataWriter.close();
}
if (deleteWriter != null) {
deleteWriter.close();
}
}
}
}
17
Source : TestAppenderFactory.java
with Apache License 2.0
from apache
@RunWith(Parameterized.clreplaced)
public abstract clreplaced TestAppenderFactory<T> extends TableTestBase {
private static final int FORMAT_V2 = 2;
private final FileFormat format;
private final boolean parreplacedioned;
private ParreplacedionKey parreplacedion = null;
private OutputFileFactory fileFactory = null;
@Parameterized.Parameters(name = "FileFormat={0}, Parreplacedioned={1}")
public static Object[] parameters() {
return new Object[][] { new Object[] { "avro", false }, new Object[] { "avro", true }, new Object[] { "parquet", false }, new Object[] { "parquet", true } };
}
public TestAppenderFactory(String fileFormat, boolean parreplacedioned) {
super(FORMAT_V2);
this.format = FileFormat.valueOf(fileFormat.toUpperCase(Locale.ENGLISH));
this.parreplacedioned = parreplacedioned;
}
@Before
public void setupTable() throws Exception {
this.tableDir = temp.newFolder();
// created by table create
replacedert.replacedertTrue(tableDir.delete());
this.metadataDir = new File(tableDir, "metadata");
if (parreplacedioned) {
this.table = create(SCHEMA, SPEC);
} else {
this.table = create(SCHEMA, ParreplacedionSpec.unparreplacedioned());
}
this.parreplacedion = createParreplacedionKey();
this.fileFactory = new OutputFileFactory(table.spec(), format, table.locationProvider(), table.io(), table.encryption(), 1, 1);
table.updateProperties().defaultFormat(format).commit();
}
protected abstract FileAppenderFactory<T> createAppenderFactory(List<Integer> equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema);
protected abstract T createRow(Integer id, String data);
protected abstract StructLikeSet expectedRowSet(Iterable<T> records) throws IOException;
private StructLikeSet actualRowSet(String... columns) throws IOException {
StructLikeSet set = StructLikeSet.create(table.schema().replacedtruct());
try (CloseableIterable<Record> reader = IcebergGenerics.read(table).select(columns).build()) {
reader.forEach(set::add);
}
return set;
}
private ParreplacedionKey createParreplacedionKey() {
if (table.spec().isUnparreplacedioned()) {
return null;
}
Record record = GenericRecord.create(table.schema()).copy(ImmutableMap.of("data", "aaa"));
ParreplacedionKey parreplacedionKey = new ParreplacedionKey(table.spec(), table.schema());
parreplacedionKey.parreplacedion(record);
return parreplacedionKey;
}
private EncryptedOutputFile createEncryptedOutputFile() {
if (parreplacedion == null) {
return fileFactory.newOutputFile();
} else {
return fileFactory.newOutputFile(parreplacedion);
}
}
private List<T> testRowSet() {
return Lists.newArrayList(createRow(1, "aaa"), createRow(2, "bbb"), createRow(3, "ccc"), createRow(4, "ddd"), createRow(5, "eee"));
}
private DataFile prepareDataFile(List<T> rowSet, FileAppenderFactory<T> appenderFactory) throws IOException {
DataWriter<T> writer = appenderFactory.newDataWriter(createEncryptedOutputFile(), format, parreplacedion);
try (DataWriter<T> closeableWriter = writer) {
for (T row : rowSet) {
closeableWriter.add(row);
}
}
return writer.toDataFile();
}
@Test
public void testDataWriter() throws IOException {
FileAppenderFactory<T> appenderFactory = createAppenderFactory(null, null, null);
List<T> rowSet = testRowSet();
DataFile dataFile = prepareDataFile(rowSet, appenderFactory);
table.newRowDelta().addRows(dataFile).commit();
replacedert.replacedertEquals("Should have the expected records.", expectedRowSet(rowSet), actualRowSet("*"));
}
@Test
public void testEqDeleteWriter() throws IOException {
List<Integer> equalityFieldIds = Lists.newArrayList(table.schema().findField("id").fieldId());
Schema eqDeleteRowSchema = table.schema().select("id");
FileAppenderFactory<T> appenderFactory = createAppenderFactory(equalityFieldIds, eqDeleteRowSchema, null);
List<T> rowSet = testRowSet();
DataFile dataFile = prepareDataFile(rowSet, appenderFactory);
table.newRowDelta().addRows(dataFile).commit();
// The equality field is 'id'. No matter what the value of the 'data' field is, we should delete the 1st, 3rd and
// 5th rows.
List<T> deletes = Lists.newArrayList(createRow(1, "aaa"), createRow(3, "bbb"), createRow(5, "ccc"));
EncryptedOutputFile out = createEncryptedOutputFile();
EqualityDeleteWriter<T> eqDeleteWriter = appenderFactory.newEqDeleteWriter(out, format, parreplacedion);
try (EqualityDeleteWriter<T> closeableWriter = eqDeleteWriter) {
closeableWriter.deleteAll(deletes);
}
// Check that the equality delete file has the expected equality deletes.
GenericRecord gRecord = GenericRecord.create(eqDeleteRowSchema);
Set<Record> expectedDeletes = Sets.newHashSet(gRecord.copy("id", 1), gRecord.copy("id", 3), gRecord.copy("id", 5));
replacedert.replacedertEquals(expectedDeletes, Sets.newHashSet(createReader(eqDeleteRowSchema, out.encryptingOutputFile().toInputFile())));
table.newRowDelta().addDeletes(eqDeleteWriter.toDeleteFile()).commit();
List<T> expected = Lists.newArrayList(createRow(2, "bbb"), createRow(4, "ddd"));
replacedert.replacedertEquals("Should have the expected records", expectedRowSet(expected), actualRowSet("*"));
}
@Test
public void testPosDeleteWriter() throws IOException {
// Initialize FileAppenderFactory without pos-delete row schema.
FileAppenderFactory<T> appenderFactory = createAppenderFactory(null, null, null);
List<T> rowSet = testRowSet();
DataFile dataFile = prepareDataFile(rowSet, appenderFactory);
List<Pair<CharSequence, Long>> deletes = Lists.newArrayList(Pair.of(dataFile.path(), 0L), Pair.of(dataFile.path(), 2L), Pair.of(dataFile.path(), 4L));
EncryptedOutputFile out = createEncryptedOutputFile();
PositionDeleteWriter<T> eqDeleteWriter = appenderFactory.newPosDeleteWriter(out, format, parreplacedion);
try (PositionDeleteWriter<T> closeableWriter = eqDeleteWriter) {
for (Pair<CharSequence, Long> delete : deletes) {
closeableWriter.delete(delete.first(), delete.second());
}
}
// Check that the pos delete file has the expected pos deletes.
Schema pathPosSchema = DeleteSchemaUtil.pathPosSchema();
GenericRecord gRecord = GenericRecord.create(pathPosSchema);
Set<Record> expectedDeletes = Sets.newHashSet(gRecord.copy("file_path", dataFile.path(), "pos", 0L), gRecord.copy("file_path", dataFile.path(), "pos", 2L), gRecord.copy("file_path", dataFile.path(), "pos", 4L));
replacedert.replacedertEquals(expectedDeletes, Sets.newHashSet(createReader(pathPosSchema, out.encryptingOutputFile().toInputFile())));
table.newRowDelta().addRows(dataFile).addDeletes(eqDeleteWriter.toDeleteFile()).validateDataFilesExist(eqDeleteWriter.referencedDataFiles()).validateDeletedFiles().commit();
List<T> expected = Lists.newArrayList(createRow(2, "bbb"), createRow(4, "ddd"));
replacedert.replacedertEquals("Should have the expected records", expectedRowSet(expected), actualRowSet("*"));
}
@Test
public void testPosDeleteWriterWithRowSchema() throws IOException {
FileAppenderFactory<T> appenderFactory = createAppenderFactory(null, null, table.schema());
List<T> rowSet = testRowSet();
DataFile dataFile = prepareDataFile(rowSet, appenderFactory);
List<PositionDelete<T>> deletes = Lists.newArrayList(new PositionDelete<T>().set(dataFile.path(), 0, rowSet.get(0)), new PositionDelete<T>().set(dataFile.path(), 2, rowSet.get(2)), new PositionDelete<T>().set(dataFile.path(), 4, rowSet.get(4)));
EncryptedOutputFile out = createEncryptedOutputFile();
PositionDeleteWriter<T> eqDeleteWriter = appenderFactory.newPosDeleteWriter(out, format, parreplacedion);
try (PositionDeleteWriter<T> closeableWriter = eqDeleteWriter) {
for (PositionDelete<T> delete : deletes) {
closeableWriter.delete(delete.path(), delete.pos(), delete.row());
}
}
// Check that the pos delete file has the expected pos deletes.
Schema pathPosRowSchema = DeleteSchemaUtil.posDeleteSchema(table.schema());
GenericRecord gRecord = GenericRecord.create(pathPosRowSchema);
GenericRecord rowRecord = GenericRecord.create(table.schema());
Set<Record> expectedDeletes = Sets.newHashSet(gRecord.copy("file_path", dataFile.path(), "pos", 0L, "row", rowRecord.copy("id", 1, "data", "aaa")), gRecord.copy("file_path", dataFile.path(), "pos", 2L, "row", rowRecord.copy("id", 3, "data", "ccc")), gRecord.copy("file_path", dataFile.path(), "pos", 4L, "row", rowRecord.copy("id", 5, "data", "eee")));
replacedert.replacedertEquals(expectedDeletes, Sets.newHashSet(createReader(pathPosRowSchema, out.encryptingOutputFile().toInputFile())));
table.newRowDelta().addRows(dataFile).addDeletes(eqDeleteWriter.toDeleteFile()).validateDataFilesExist(eqDeleteWriter.referencedDataFiles()).validateDeletedFiles().commit();
List<T> expected = Lists.newArrayList(createRow(2, "bbb"), createRow(4, "ddd"));
replacedert.replacedertEquals("Should have the expected records", expectedRowSet(expected), actualRowSet("*"));
}
private CloseableIterable<Record> createReader(Schema schema, InputFile inputFile) {
switch(format) {
case PARQUET:
return Parquet.read(inputFile).project(schema).createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)).build();
case AVRO:
return Avro.read(inputFile).project(schema).createReaderFunc(DataReader::create).build();
default:
throw new UnsupportedOperationException("Unsupported file format: " + format);
}
}
}
17
Source : OutputFileFactory.java
with Apache License 2.0
from apache
/**
* Factory responsible for generating unique but recognizable data file names.
*/
public class OutputFileFactory {
private final PartitionSpec spec;
private final FileFormat format;
private final LocationProvider locations;
private final FileIO io;
private final EncryptionManager encryptionManager;
private final int partitionId;
private final long taskId;
// The purpose of this uuid is to be able to know from two paths that they were written by the same operation.
// That's useful, for example, if a Spark job dies and leaves files in the file system, you can identify them all
// with a recursive listing and grep.
private final String operationId;
private final AtomicInteger fileCount = new AtomicInteger(0);
/**
* Constructor where a generated UUID is used as the operationId to ensure uniqueness.
* @param spec Partition specification used by the location provider
* @param format File format used for the extension
* @param locations Location provider used for generating locations
* @param io FileIO to store the files
* @param encryptionManager Encryption manager used for encrypting the files
* @param partitionId First part of the file name
* @param taskId Second part of the file name
*/
public OutputFileFactory(PartitionSpec spec, FileFormat format, LocationProvider locations, FileIO io, EncryptionManager encryptionManager, int partitionId, long taskId) {
this(spec, format, locations, io, encryptionManager, partitionId, taskId, UUID.randomUUID().toString());
}
/**
* Constructor with specific operationId. The [partitionId, taskId, operationId] triplet has to be unique across JVM
* instances otherwise the same file name could be generated by different instances of the OutputFileFactory.
* @param spec Partition specification used by the location provider
* @param format File format used for the extension
* @param locations Location provider used for generating locations
* @param io FileIO to store the files
* @param encryptionManager Encryption manager used for encrypting the files
* @param partitionId First part of the file name
* @param taskId Second part of the file name
* @param operationId Third part of the file name
*/
public OutputFileFactory(PartitionSpec spec, FileFormat format, LocationProvider locations, FileIO io, EncryptionManager encryptionManager, int partitionId, long taskId, String operationId) {
this.spec = spec;
this.format = format;
this.locations = locations;
this.io = io;
this.encryptionManager = encryptionManager;
this.partitionId = partitionId;
this.taskId = taskId;
this.operationId = operationId;
}
private String generateFilename() {
return format.addExtension(String.format("%05d-%d-%s-%05d", partitionId, taskId, operationId, fileCount.incrementAndGet()));
}
/**
* Generates EncryptedOutputFile for UnpartitionedWriter.
*/
public EncryptedOutputFile newOutputFile() {
OutputFile file = io.newOutputFile(locations.newDataLocation(generateFilename()));
return encryptionManager.encrypt(file);
}
/**
* Generates EncryptedOutputFile for PartitionedWriter.
*/
public EncryptedOutputFile newOutputFile(StructLike partition) {
String newDataLocation = locations.newDataLocation(spec, partition, generateFilename());
OutputFile rawOutputFile = io.newOutputFile(newDataLocation);
return encryptionManager.encrypt(rawOutputFile);
}
}
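A short usage sketch, mirroring how the tests above construct this factory for an unpartitioned table ('table' is assumed to be an existing org.apache.iceberg.Table):
// Hypothetical usage, not taken from this source file.
OutputFileFactory fileFactory = new OutputFileFactory(table.spec(), FileFormat.PARQUET, table.locationProvider(), table.io(), table.encryption(), 1, 1);
EncryptedOutputFile first = fileFactory.newOutputFile();
EncryptedOutputFile second = fileFactory.newOutputFile();
// File names follow the %05d-%d-%s-%05d pattern plus the format extension,
// e.g. 00001-1-<operationId>-00001.parquet and 00001-1-<operationId>-00002.parquet.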
16
Source : Spark3Util.java
with Apache License 2.0
from apache
public static boolean isVectorizationEnabled(FileFormat fileFormat, Map<String, String> properties, RuntimeConfig sessionConf, CaseInsensitiveStringMap readOptions) {
String readOptionValue = readOptions.get(SparkReadOptions.VECTORIZATION_ENABLED);
if (readOptionValue != null) {
return Boolean.parseBoolean(readOptionValue);
}
String sessionConfValue = sessionConf.get("spark.sql.iceberg.vectorization.enabled", null);
if (sessionConfValue != null) {
return Boolean.parseBoolean(sessionConfValue);
}
switch(fileFormat) {
case PARQUET:
return PropertyUtil.propertyAsBoolean(properties, TableProperties.PARQUET_VECTORIZATION_ENABLED, TableProperties.PARQUET_VECTORIZATION_ENABLED_DEFAULT);
case ORC:
return PropertyUtil.propertyAsBoolean(properties, TableProperties.ORC_VECTORIZATION_ENABLED, TableProperties.ORC_VECTORIZATION_ENABLED_DEFAULT);
default:
return false;
}
}
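The method resolves vectorization in precedence order: read option first, then session configuration, then the table property (for Parquet and ORC only). An illustrative call showing the read option overriding a table property ('spark' is assumed to be an active SparkSession):
// Hypothetical call, not taken from this source file.
Map<String, String> tableProps = ImmutableMap.of(TableProperties.PARQUET_VECTORIZATION_ENABLED, "false");
CaseInsensitiveStringMap readOptions = new CaseInsensitiveStringMap(ImmutableMap.of(SparkReadOptions.VECTORIZATION_ENABLED, "true"));
boolean enabled = Spark3Util.isVectorizationEnabled(FileFormat.PARQUET, tableProps, spark.conf(), readOptions);
// enabled == true: the read option wins even though the table property disables vectorization.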
16
Source : TestInputFormatReaderDeletes.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public class TestInputFormatReaderDeletes extends DeleteReadTests {
private final Configuration conf = new Configuration();
private final HadoopTables tables = new HadoopTables(conf);
private TestHelper helper;
// parametrized variables
private final String inputFormat;
private final FileFormat fileFormat;
@Parameterized.Parameters(name = "inputFormat = {0}, fileFormat={1}")
public static Object[][] parameters() {
return new Object[][] { { "IcebergInputFormat", FileFormat.PARQUET }, { "IcebergInputFormat", FileFormat.AVRO }, { "IcebergInputFormat", FileFormat.ORC }, { "MapredIcebergInputFormat", FileFormat.PARQUET }, { "MapredIcebergInputFormat", FileFormat.AVRO }, { "MapredIcebergInputFormat", FileFormat.ORC } };
}
public TestInputFormatReaderDeletes(String inputFormat, FileFormat fileFormat) {
this.inputFormat = inputFormat;
this.fileFormat = fileFormat;
}
@Override
protected Table createTable(String name, Schema schema, PartitionSpec spec) throws IOException {
Table table;
File location = temp.newFolder(inputFormat, fileFormat.name());
Assert.assertTrue(location.delete());
helper = new TestHelper(conf, tables, location.toString(), schema, spec, fileFormat, temp);
table = helper.createTable();
TableOperations ops = ((BaseTable) table).operations();
TableMetadata meta = ops.current();
ops.commit(meta, meta.upgradeToFormatVersion(2));
return table;
}
@Override
protected void dropTable(String name) {
tables.dropTable(helper.table().location());
}
@Override
public StructLikeSet rowSet(String name, Table table, String... columns) {
InputFormatConfig.ConfigBuilder builder = new InputFormatConfig.ConfigBuilder(conf).readFrom(table.location());
Schema projected = table.schema().select(columns);
StructLikeSet set = StructLikeSet.create(projected.asStruct());
set.addAll(TestIcebergInputFormats.TESTED_INPUT_FORMATS.stream().filter(recordFactory -> recordFactory.name().equals(inputFormat)).map(recordFactory -> recordFactory.create(builder.project(projected).conf()).getRecords()).flatMap(List::stream).collect(Collectors.toList()));
return set;
}
@Override
protected boolean expectPruned() {
return false;
}
}
16
Source : TestIcebergStreamWriter.java
with Apache License 2.0
from apache
@RunWith(Parameterized.clreplaced)
public clreplaced TestIcebergStreamWriter {
@Rule
public TemporaryFolder tempFolder = new TemporaryFolder();
private String tablePath;
private Table table;
private final FileFormat format;
private final boolean parreplacedioned;
@Parameterized.Parameters(name = "format = {0}, parreplacedioned = {1}")
public static Object[][] parameters() {
return new Object[][] { { "avro", true }, { "avro", false }, { "orc", true }, { "orc", false }, { "parquet", true }, { "parquet", false } };
}
public TestIcebergStreamWriter(String format, boolean parreplacedioned) {
this.format = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));
this.parreplacedioned = parreplacedioned;
}
@Before
public void before() throws IOException {
File folder = tempFolder.newFolder();
tablePath = folder.getAbsolutePath();
// Construct the iceberg table.
Map<String, String> props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name());
table = SimpleDataUtil.createTable(tablePath, props, parreplacedioned);
}
@Test
public void testWritingTable() throws Exception {
long checkpointId = 1L;
try (OneInputStreamOperatorTestHarness<RowData, WriteResult> testHarness = createIcebergStreamWriter()) {
// The first checkpoint
testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1);
testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 1);
testHarness.processElement(SimpleDataUtil.createRowData(3, "hello"), 1);
testHarness.prepareSnapshotPreBarrier(checkpointId);
long expectedDataFiles = parreplacedioned ? 2 : 1;
WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build();
replacedert.replacedertEquals(0, result.deleteFiles().length);
replacedert.replacedertEquals(expectedDataFiles, result.dataFiles().length);
checkpointId = checkpointId + 1;
// The second checkpoint
testHarness.processElement(SimpleDataUtil.createRowData(4, "foo"), 1);
testHarness.processElement(SimpleDataUtil.createRowData(5, "bar"), 2);
testHarness.prepareSnapshotPreBarrier(checkpointId);
expectedDataFiles = parreplacedioned ? 4 : 2;
result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build();
replacedert.replacedertEquals(0, result.deleteFiles().length);
replacedert.replacedertEquals(expectedDataFiles, result.dataFiles().length);
// Commit the iceberg transaction.
AppendFiles appendFiles = table.newAppend();
Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile);
appendFiles.commit();
// Assert the table records.
SimpleDataUtil.replacedertTableRecords(tablePath, Lists.newArrayList(SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world"), SimpleDataUtil.createRecord(3, "hello"), SimpleDataUtil.createRecord(4, "foo"), SimpleDataUtil.createRecord(5, "bar")));
}
}
@Test
public void testSnapshotTwice() throws Exception {
long checkpointId = 1;
long timestamp = 1;
try (OneInputStreamOperatorTestHarness<RowData, WriteResult> testHarness = createIcebergStreamWriter()) {
testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), timestamp++);
testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), timestamp);
testHarness.prepareSnapshotPreBarrier(checkpointId++);
long expectedDataFiles = parreplacedioned ? 2 : 1;
WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build();
replacedert.replacedertEquals(0, result.deleteFiles().length);
replacedert.replacedertEquals(expectedDataFiles, result.dataFiles().length);
// snapshot again immediately.
for (int i = 0; i < 5; i++) {
testHarness.prepareSnapshotPreBarrier(checkpointId++);
result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build();
replacedert.replacedertEquals(0, result.deleteFiles().length);
replacedert.replacedertEquals(expectedDataFiles, result.dataFiles().length);
}
}
}
@Test
public void testTableWithoutSnapshot() throws Exception {
try (OneInputStreamOperatorTestHarness<RowData, WriteResult> testHarness = createIcebergStreamWriter()) {
replacedert.replacedertEquals(0, testHarness.extractOutputValues().size());
}
// Even though we closed the iceberg stream writer, there is no orphan data file.
replacedert.replacedertEquals(0, scanDataFiles().size());
try (OneInputStreamOperatorTestHarness<RowData, WriteResult> testHarness = createIcebergStreamWriter()) {
testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1);
// Still no data file emitted yet, because there is no checkpoint.
replacedert.replacedertEquals(0, testHarness.extractOutputValues().size());
}
// Once we close the iceberg stream writer, an orphan data file will be left behind.
replacedert.replacedertEquals(1, scanDataFiles().size());
}
private Set<String> scanDataFiles() throws IOException {
Path dataDir = new Path(tablePath, "data");
FileSystem fs = FileSystem.get(new Configuration());
if (!fs.exists(dataDir)) {
return ImmutableSet.of();
} else {
Set<String> paths = Sets.newHashSet();
RemoteIterator<LocatedFileStatus> iterators = fs.listFiles(dataDir, true);
while (iterators.hasNext()) {
LocatedFileStatus status = iterators.next();
if (status.isFile()) {
Path path = status.getPath();
if (path.getName().endsWith("." + format.toString().toLowerCase())) {
paths.add(path.toString());
}
}
}
return paths;
}
}
@Test
public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception {
try (OneInputStreamOperatorTestHarness<RowData, WriteResult> testHarness = createIcebergStreamWriter()) {
testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1);
testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2);
Assert.assertTrue(testHarness.getOneInputOperator() instanceof BoundedOneInput);
((BoundedOneInput) testHarness.getOneInputOperator()).endInput();
long expectedDataFiles = partitioned ? 2 : 1;
WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build();
Assert.assertEquals(0, result.deleteFiles().length);
Assert.assertEquals(expectedDataFiles, result.dataFiles().length);
// invoke endInput again.
((BoundedOneInput) testHarness.getOneInputOperator()).endInput();
result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build();
Assert.assertEquals(0, result.deleteFiles().length);
Assert.assertEquals(expectedDataFiles * 2, result.dataFiles().length);
}
}
@Test
public void testTableWithTargetFileSize() throws Exception {
// TODO: ORC files do not support a target file size before they are closed.
if (format == FileFormat.ORC) {
return;
}
// Adjust the target-file-size in table properties.
table.updateProperties().set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, // ~4 bytes; low enough to trigger
"4").commit();
List<RowData> rows = Lists.newArrayListWithCapacity(8000);
List<Record> records = Lists.newArrayListWithCapacity(8000);
for (int i = 0; i < 2000; i++) {
for (String data : new String[] { "a", "b", "c", "d" }) {
rows.add(SimpleDataUtil.createRowData(i, data));
records.add(SimpleDataUtil.createRecord(i, data));
}
}
try (OneInputStreamOperatorTestHarness<RowData, WriteResult> testHarness = createIcebergStreamWriter()) {
for (RowData row : rows) {
testHarness.processElement(row, 1);
}
// snapshot the operator.
testHarness.prepareSnapshotPreBarrier(1);
WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build();
Assert.assertEquals(0, result.deleteFiles().length);
Assert.assertEquals(8, result.dataFiles().length);
// Assert that the data files have the expected records.
for (DataFile dataFile : result.dataFiles()) {
Assert.assertEquals(1000, dataFile.recordCount());
}
// Commit the iceberg transaction.
AppendFiles appendFiles = table.newAppend();
Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile);
appendFiles.commit();
}
// Assert the table records.
SimpleDataUtil.assertTableRecords(tablePath, records);
}
@Test
public void testPromotedFlinkDataType() throws Exception {
Schema iSchema = new Schema(Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), Types.NestedField.required(2, "smallint", Types.IntegerType.get()), Types.NestedField.optional(3, "int", Types.IntegerType.get()));
TableSchema flinkSchema = TableSchema.builder().field("tinyint", DataTypes.TINYINT().notNull()).field("smallint", DataTypes.SMALLINT().notNull()).field("int", DataTypes.INT().nullable()).build();
PartitionSpec spec;
if (partitioned) {
spec = PartitionSpec.builderFor(iSchema).identity("smallint").identity("tinyint").identity("int").build();
} else {
spec = PartitionSpec.unpartitioned();
}
String location = tempFolder.newFolder().getAbsolutePath();
Map<String, String> props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name());
Table icebergTable = new HadoopTables().create(iSchema, spec, props, location);
List<RowData> rows = Lists.newArrayList(GenericRowData.of((byte) 0x01, (short) -32768, 101), GenericRowData.of((byte) 0x02, (short) 0, 102), GenericRowData.of((byte) 0x03, (short) 32767, 103));
Record record = GenericRecord.create(iSchema);
List<Record> expected = Lists.newArrayList(record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103)));
try (OneInputStreamOperatorTestHarness<RowData, WriteResult> testHarness = createIcebergStreamWriter(icebergTable, flinkSchema)) {
for (RowData row : rows) {
testHarness.processElement(row, 1);
}
testHarness.prepareSnapshotPreBarrier(1);
WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build();
Assert.assertEquals(0, result.deleteFiles().length);
Assert.assertEquals(partitioned ? 3 : 1, result.dataFiles().length);
// Commit the iceberg transaction.
AppendFiles appendFiles = icebergTable.newAppend();
Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile);
appendFiles.commit();
}
SimpleDataUtil.replacedertTableRecords(location, expected);
}
private OneInputStreamOperatorTestHarness<RowData, WriteResult> createIcebergStreamWriter() throws Exception {
return createIcebergStreamWriter(table, SimpleDataUtil.FLINK_SCHEMA);
}
private OneInputStreamOperatorTestHarness<RowData, WriteResult> createIcebergStreamWriter(Table icebergTable, TableSchema flinkSchema) throws Exception {
RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema);
IcebergStreamWriter<RowData> streamWriter = FlinkSink.createStreamWriter(icebergTable, flinkRowType, null);
OneInputStreamOperatorTestHarness<RowData, WriteResult> harness = new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0);
harness.setup();
harness.open();
return harness;
}
}
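The Flink stream-writer tests above pass the chosen format to the table through TableProperties.DEFAULT_FILE_FORMAT ('write.format.default'), and writers later read it back from the table properties. Below is a minimal, self-contained sketch of that lookup, mirroring the pattern used in RowDataRewriter further down this page; the class name FileFormatResolver is invented here purely for illustration.
import java.util.Locale;
import java.util.Map;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.TableProperties;

public class FileFormatResolver {
  // Falls back to the table-wide default ("parquet") when the property is not set.
  static FileFormat resolve(Map<String, String> tableProperties) {
    String formatName = tableProperties.getOrDefault(
        TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
    return FileFormat.valueOf(formatName.toUpperCase(Locale.ENGLISH));
  }
}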
16
Source : TestRewriteDataFilesAction.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public class TestRewriteDataFilesAction extends FlinkCatalogTestBase {
private static final String TABLE_NAME_UNPARTITIONED = "test_table_unpartitioned";
private static final String TABLE_NAME_PARTITIONED = "test_table_partitioned";
private final FileFormat format;
private Table icebergTableUnPartitioned;
private Table icebergTablePartitioned;
public TestRewriteDataFilesAction(String catalogName, Namespace baseNamespace, FileFormat format) {
super(catalogName, baseNamespace);
this.format = format;
}
@Override
protected TableEnvironment getTableEnv() {
super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1);
return super.getTableEnv();
}
@Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}")
public static Iterable<Object[]> parameters() {
List<Object[]> parameters = Lists.newArrayList();
for (FileFormat format : new FileFormat[] { FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET }) {
for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) {
String catalogName = (String) catalogParams[0];
Namespace baseNamespace = (Namespace) catalogParams[1];
parameters.add(new Object[] { catalogName, baseNamespace, format });
}
}
return parameters;
}
@Rule
public TemporaryFolder temp = new TemporaryFolder();
@Before
public void before() {
super.before();
sql("CREATE DATABASE %s", flinkDatabase);
sql("USE CATALOG %s", catalogName);
sql("USE %s", DATABASE);
sql("CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", TABLE_NAME_UNPARreplacedIONED, format.name());
icebergTableUnParreplacedioned = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_UNPARreplacedIONED));
sql("CREATE TABLE %s (id int, data varchar,spec varchar) " + " PARreplacedIONED BY (data,spec) with ('write.format.default'='%s')", TABLE_NAME_PARreplacedIONED, format.name());
icebergTableParreplacedioned = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_PARreplacedIONED));
}
@After
public void clean() {
sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_UNPARreplacedIONED);
sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_PARreplacedIONED);
sql("DROP DATABASE IF EXISTS %s", flinkDatabase);
super.clean();
}
@Test
public void testRewriteDataFilesEmptyTable() throws Exception {
replacedert.replacedertNull("Table must be empty", icebergTableUnParreplacedioned.currentSnapshot());
Actions.forTable(icebergTableUnParreplacedioned).rewriteDataFiles().execute();
replacedert.replacedertNull("Table must stay empty", icebergTableUnParreplacedioned.currentSnapshot());
}
@Test
public void testRewriteDataFilesUnpartitionedTable() throws Exception {
sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_UNPARTITIONED);
sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_UNPARTITIONED);
icebergTableUnPartitioned.refresh();
CloseableIterable<FileScanTask> tasks = icebergTableUnPartitioned.newScan().planFiles();
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size());
RewriteDataFilesActionResult result = Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute();
Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
icebergTableUnPartitioned.refresh();
CloseableIterable<FileScanTask> tasks1 = icebergTableUnPartitioned.newScan().planFiles();
List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
Assert.assertEquals("Should have 1 data file after rewrite", 1, dataFiles1.size());
// Assert the table records as expected.
SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, Lists.newArrayList(SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world")));
}
@Test
public void testRewriteDataFilesPartitionedTable() throws Exception {
sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED);
sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED);
sql("INSERT INTO %s SELECT 3, 'world' ,'b'", TABLE_NAME_PARTITIONED);
sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED);
icebergTablePartitioned.refresh();
CloseableIterable<FileScanTask> tasks = icebergTablePartitioned.newScan().planFiles();
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size());
RewriteDataFilesActionResult result = Actions.forTable(icebergTablePartitioned).rewriteDataFiles().execute();
Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size());
Assert.assertEquals("Action should add 2 data files", 2, result.addedDataFiles().size());
icebergTablePartitioned.refresh();
CloseableIterable<FileScanTask> tasks1 = icebergTablePartitioned.newScan().planFiles();
List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFiles1.size());
// Assert the table records as expected.
Schema schema = new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get()), Types.NestedField.optional(2, "data", Types.StringType.get()), Types.NestedField.optional(3, "spec", Types.StringType.get()));
Record record = GenericRecord.create(schema);
SimpleDataUtil.assertTableRecords(icebergTablePartitioned, Lists.newArrayList(record.copy("id", 1, "data", "hello", "spec", "a"), record.copy("id", 2, "data", "hello", "spec", "a"), record.copy("id", 3, "data", "world", "spec", "b"), record.copy("id", 4, "data", "world", "spec", "b")));
}
@Test
public void testRewriteDataFilesWithFilter() throws Exception {
sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARreplacedIONED);
sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARreplacedIONED);
sql("INSERT INTO %s SELECT 3, 'world' ,'a'", TABLE_NAME_PARreplacedIONED);
sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARreplacedIONED);
sql("INSERT INTO %s SELECT 5, 'world' ,'b'", TABLE_NAME_PARreplacedIONED);
icebergTableParreplacedioned.refresh();
CloseableIterable<FileScanTask> tasks = icebergTableParreplacedioned.newScan().planFiles();
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
replacedert.replacedertEquals("Should have 5 data files before rewrite", 5, dataFiles.size());
RewriteDataFilesActionResult result = Actions.forTable(icebergTableParreplacedioned).rewriteDataFiles().filter(Expressions.equal("spec", "a")).filter(Expressions.startsWith("data", "he")).execute();
replacedert.replacedertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
replacedert.replacedertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
icebergTableParreplacedioned.refresh();
CloseableIterable<FileScanTask> tasks1 = icebergTableParreplacedioned.newScan().planFiles();
List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
replacedert.replacedertEquals("Should have 4 data files after rewrite", 4, dataFiles1.size());
// replacedert the table records as expected.
Schema schema = new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get()), Types.NestedField.optional(2, "data", Types.StringType.get()), Types.NestedField.optional(3, "spec", Types.StringType.get()));
Record record = GenericRecord.create(schema);
SimpleDataUtil.replacedertTableRecords(icebergTableParreplacedioned, Lists.newArrayList(record.copy("id", 1, "data", "hello", "spec", "a"), record.copy("id", 2, "data", "hello", "spec", "a"), record.copy("id", 3, "data", "world", "spec", "a"), record.copy("id", 4, "data", "world", "spec", "b"), record.copy("id", 5, "data", "world", "spec", "b")));
}
@Test
public void testRewriteLargeTableHasResiduals() throws IOException {
// all records belong to the same partition
List<String> records1 = Lists.newArrayList();
List<String> records2 = Lists.newArrayList();
List<Record> expected = Lists.newArrayList();
for (int i = 0; i < 100; i++) {
int id = i;
String data = String.valueOf(i % 3);
if (i % 2 == 0) {
records1.add("(" + id + ",'" + data + "')");
} else {
records2.add("(" + id + ",'" + data + "')");
}
Record record = RECORD.copy();
record.setField("id", id);
record.setField("data", data);
expected.add(record);
}
sql("INSERT INTO %s values " + StringUtils.join(records1, ","), TABLE_NAME_UNPARreplacedIONED);
sql("INSERT INTO %s values " + StringUtils.join(records2, ","), TABLE_NAME_UNPARreplacedIONED);
icebergTableUnParreplacedioned.refresh();
CloseableIterable<FileScanTask> tasks = icebergTableUnParreplacedioned.newScan().ignoreResiduals().filter(Expressions.equal("data", "0")).planFiles();
for (FileScanTask task : tasks) {
replacedert.replacedertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual());
}
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
replacedert.replacedertEquals("Should have 2 data files before rewrite", 2, dataFiles.size());
Actions actions = Actions.forTable(icebergTableUnParreplacedioned);
RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute();
replacedert.replacedertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
replacedert.replacedertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
// replacedert the table records as expected.
SimpleDataUtil.replacedertTableRecords(icebergTableUnParreplacedioned, expected);
}
/**
* A test case verifying that data files are not compacted repeatedly.
* <p>
* If a data file cannot be combined with other data files into a CombinedScanTask, the resulting CombinedScanTask
* contains only that one file, so such CombinedScanTasks are removed to avoid rewriting the same file repeatedly.
* <p>
* In this test case, we generate 3 data files and set targetSizeInBytes greater than the largest file size so that it
* cannot be combined into a CombinedScanTask with the other data files. The data file with the largest file size will
* therefore not be rewritten.
*
* @throws IOException IOException
*/
@Test
public void testRewriteAvoidRepeateCompress() throws IOException {
Assume.assumeFalse("ORC does not support getting the length while the file is still open", format.equals(FileFormat.ORC));
List<Record> expected = Lists.newArrayList();
Schema schema = icebergTableUnPartitioned.schema();
GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema);
File file = temp.newFile();
int count = 0;
try (FileAppender<Record> fileAppender = genericAppenderFactory.newAppender(Files.localOutput(file), format)) {
long filesize = 20000;
for (; fileAppender.length() < filesize; count++) {
Record record = SimpleDataUtil.createRecord(count, "iceberg");
fileAppender.add(record);
expected.add(record);
}
}
DataFile dataFile = DataFiles.builder(icebergTableUnPartitioned.spec()).withPath(file.getAbsolutePath()).withFileSizeInBytes(file.length()).withFormat(format).withRecordCount(count).build();
icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit();
sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED);
sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED);
icebergTableUnPartitioned.refresh();
CloseableIterable<FileScanTask> tasks = icebergTableUnPartitioned.newScan().planFiles();
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
Assert.assertEquals("Should have 3 data files before rewrite", 3, dataFiles.size());
Actions actions = Actions.forTable(icebergTableUnPartitioned);
long targetSizeInBytes = file.length() + 10;
RewriteDataFilesActionResult result = actions.rewriteDataFiles().targetSizeInBytes(targetSizeInBytes).splitOpenFileCost(1).execute();
Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
icebergTableUnPartitioned.refresh();
CloseableIterable<FileScanTask> tasks1 = icebergTableUnPartitioned.newScan().planFiles();
List<DataFile> dataFilesRewrote = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFilesRewrote.size());
// the biggest file is not rewritten
List rewroteDataFileNames = dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList());
Assert.assertTrue(rewroteDataFileNames.contains(file.getAbsolutePath()));
// Assert the table records as expected.
expected.add(SimpleDataUtil.createRecord(1, "a"));
expected.add(SimpleDataUtil.createRecord(2, "b"));
SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected);
}
}
16
Source : RowDataRewriter.java
with Apache License 2.0
from apache
public class RowDataRewriter {
private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class);
private final Schema schema;
private final FileFormat format;
private final String nameMapping;
private final FileIO io;
private final boolean caseSensitive;
private final EncryptionManager encryptionManager;
private final TaskWriterFactory<RowData> taskWriterFactory;
private final String tableName;
public RowDataRewriter(Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) {
this.schema = table.schema();
this.caseSensitive = caseSensitive;
this.io = io;
this.encryptionManager = encryptionManager;
this.nameMapping = PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null);
this.tableName = table.name();
String formatString = PropertyUtil.propertyAsString(table.properties(), TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
RowType flinkSchema = FlinkSchemaUtil.convert(table.schema());
this.taskWriterFactory = new RowDataTaskWriterFactory(table.schema(), flinkSchema, table.spec(), table.locationProvider(), io, encryptionManager, Long.MAX_VALUE, format, table.properties(), null);
}
public List<DataFile> rewriteDataForTasks(DataStream<CombinedScanTask> dataStream, int parallelism) throws Exception {
RewriteMap map = new RewriteMap(schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory);
DataStream<List<DataFile>> ds = dataStream.map(map).setParallelism(parallelism);
return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream().flatMap(Collection::stream).collect(Collectors.toList());
}
public static class RewriteMap extends RichMapFunction<CombinedScanTask, List<DataFile>> {
private TaskWriter<RowData> writer;
private int subTaskId;
private int attemptId;
private final Schema schema;
private final String nameMapping;
private final FileIO io;
private final boolean caseSensitive;
private final EncryptionManager encryptionManager;
private final TaskWriterFactory<RowData> taskWriterFactory;
public RewriteMap(Schema schema, String nameMapping, FileIO io, boolean caseSensitive, EncryptionManager encryptionManager, TaskWriterFactory<RowData> taskWriterFactory) {
this.schema = schema;
this.nameMapping = nameMapping;
this.io = io;
this.caseSensitive = caseSensitive;
this.encryptionManager = encryptionManager;
this.taskWriterFactory = taskWriterFactory;
}
@Override
public void open(Configuration parameters) {
this.subTaskId = getRuntimeContext().getIndexOfThisSubtask();
this.attemptId = getRuntimeContext().getAttemptNumber();
// Initialize the task writer factory.
this.taskWriterFactory.initialize(subTaskId, attemptId);
}
@Override
public List<DataFile> map(CombinedScanTask task) throws Exception {
// Initialize the task writer.
this.writer = taskWriterFactory.create();
try (RowDataIterator iterator = new RowDataIterator(task, io, encryptionManager, schema, schema, nameMapping, caseSensitive)) {
while (iterator.hasNext()) {
RowData rowData = iterator.next();
writer.write(rowData);
}
return Lists.newArrayList(writer.dataFiles());
} catch (Throwable originalThrowable) {
try {
LOG.error("Aborting commit for (subTaskId {}, attemptId {})", subTaskId, attemptId);
writer.abort();
LOG.error("Aborted commit for (subTaskId {}, attemptId {})", subTaskId, attemptId);
} catch (Throwable inner) {
if (originalThrowable != inner) {
originalThrowable.addSuppressed(inner);
LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner);
}
}
if (originalThrowable instanceof Exception) {
throw originalThrowable;
} else {
throw new RuntimeException(originalThrowable);
}
}
}
}
}
16
Source : GenericAppenderFactory.java
with Apache License 2.0
from apache
@Override
public EqualityDeleteWriter<Record> newEqDeleteWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) {
Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field ids shouldn't be null or empty when creating equality-delete writer");
Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer");
MetricsConfig metricsConfig = MetricsConfig.fromProperties(config);
try {
switch(format) {
case AVRO:
return Avro.writeDeletes(file.encryptingOutputFile()).createWriterFunc(DataWriter::create).withPartition(partition).overwrite().setAll(config).rowSchema(eqDeleteRowSchema).withSpec(spec).withKeyMetadata(file.keyMetadata()).equalityFieldIds(equalityFieldIds).buildEqualityWriter();
case PARQUET:
return Parquet.writeDeletes(file.encryptingOutputFile()).createWriterFunc(GenericParquetWriter::buildWriter).withPartition(partition).overwrite().setAll(config).metricsConfig(metricsConfig).rowSchema(eqDeleteRowSchema).withSpec(spec).withKeyMetadata(file.keyMetadata()).equalityFieldIds(equalityFieldIds).buildEqualityWriter();
default:
throw new UnsupportedOperationException("Cannot write equality-deletes for unsupported file format: " + format);
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
16
Source : GenericAppenderFactory.java
with Apache License 2.0
from apache
@Override
public PositionDeleteWriter<Record> newPosDeleteWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) {
MetricsConfig metricsConfig = MetricsConfig.fromProperties(config);
try {
switch(format) {
case AVRO:
return Avro.writeDeletes(file.encryptingOutputFile()).createWriterFunc(DataWriter::create).withPartition(partition).overwrite().setAll(config).rowSchema(posDeleteRowSchema).withSpec(spec).withKeyMetadata(file.keyMetadata()).buildPositionWriter();
case PARQUET:
return Parquet.writeDeletes(file.encryptingOutputFile()).createWriterFunc(GenericParquetWriter::buildWriter).withPartition(partition).overwrite().setAll(config).metricsConfig(metricsConfig).rowSchema(posDeleteRowSchema).withSpec(spec).withKeyMetadata(file.keyMetadata()).buildPositionWriter();
default:
throw new UnsupportedOperationException("Cannot write pos-deletes for unsupported file format: " + format);
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
16
Source : DataWriter.java
with Apache License 2.0
from apache
public class DataWriter<T> implements Closeable {
private final FileAppender<T> appender;
private final FileFormat format;
private final String location;
private final PartitionSpec spec;
private final StructLike partition;
private final ByteBuffer keyMetadata;
private DataFile dataFile = null;
public DataWriter(FileAppender<T> appender, FileFormat format, String location, PartitionSpec spec, StructLike partition, EncryptionKeyMetadata keyMetadata) {
this.appender = appender;
this.format = format;
this.location = location;
this.spec = spec;
this.partition = partition;
this.keyMetadata = keyMetadata != null ? keyMetadata.buffer() : null;
}
public void add(T row) {
appender.add(row);
}
public long length() {
return appender.length();
}
@Override
public void close() throws IOException {
if (dataFile == null) {
appender.close();
this.dataFile = DataFiles.builder(spec).withFormat(format).withPath(location).withPartition(partition).withEncryptionKeyMetadata(keyMetadata).withFileSizeInBytes(appender.length()).withMetrics(appender.metrics()).withSplitOffsets(appender.splitOffsets()).build();
}
}
public DataFile toDataFile() {
Preconditions.checkState(dataFile != null, "Cannot create data file from unclosed writer");
return dataFile;
}
}
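Besides naming the format, FileFormat also owns the file extension that writers like the DataWriter above end up producing. A small, hedged sketch of the two helper methods involved (the class name ExtensionExample is invented for illustration):
import org.apache.iceberg.FileFormat;

public class ExtensionExample {
  public static void main(String[] args) {
    // addExtension appends the format suffix only when it is not already present.
    String path = FileFormat.PARQUET.addExtension("/warehouse/db/table/data/part-00000");
    System.out.println(path); // /warehouse/db/table/data/part-00000.parquet
    // fromFileName resolves the format back from a file name (null if it is unknown).
    System.out.println(FileFormat.fromFileName(path)); // PARQUET
  }
}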
16
Source : BaseTaskWriter.java
with Apache License 2.0
from apache
public abstract class BaseTaskWriter<T> implements TaskWriter<T> {
private final List<DataFile> completedDataFiles = Lists.newArrayList();
private final List<DeleteFile> completedDeleteFiles = Lists.newArrayList();
private final Set<CharSequence> referencedDataFiles = CharSequenceSet.empty();
private final PartitionSpec spec;
private final FileFormat format;
private final FileAppenderFactory<T> appenderFactory;
private final OutputFileFactory fileFactory;
private final FileIO io;
private final long targetFileSize;
protected BaseTaskWriter(PartitionSpec spec, FileFormat format, FileAppenderFactory<T> appenderFactory, OutputFileFactory fileFactory, FileIO io, long targetFileSize) {
this.spec = spec;
this.format = format;
this.appenderFactory = appenderFactory;
this.fileFactory = fileFactory;
this.io = io;
this.targetFileSize = targetFileSize;
}
protected PartitionSpec spec() {
return spec;
}
@Override
public void abort() throws IOException {
close();
// clean up files created by this writer
Tasks.foreach(Iterables.concat(completedDataFiles, completedDeleteFiles)).throwFailureWhenFinished().noRetry().run(file -> io.deleteFile(file.path().toString()));
}
@Override
public WriteResult complete() throws IOException {
close();
return WriteResult.builder().addDataFiles(completedDataFiles).addDeleteFiles(completedDeleteFiles).addReferencedDataFiles(referencedDataFiles).build();
}
/**
* Base equality delta writer to write both insert records and equality-deletes.
*/
protected abstract class BaseEqualityDeltaWriter implements Closeable {
private final StructProjection structProjection;
private RollingFileWriter dataWriter;
private RollingEqDeleteWriter eqDeleteWriter;
private SortedPosDeleteWriter<T> posDeleteWriter;
private Map<StructLike, PathOffset> insertedRowMap;
protected BaseEqualityDeltaWriter(StructLike partition, Schema schema, Schema deleteSchema) {
Preconditions.checkNotNull(schema, "Iceberg table schema cannot be null.");
Preconditions.checkNotNull(deleteSchema, "Equality-delete schema cannot be null.");
this.structProjection = StructProjection.create(schema, deleteSchema);
this.dataWriter = new RollingFileWriter(partition);
this.eqDeleteWriter = new RollingEqDeleteWriter(partition);
this.posDeleteWriter = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, partition);
this.insertedRowMap = StructLikeMap.create(deleteSchema.asStruct());
}
/**
* Wrap the data as a {@link StructLike}.
*/
protected abstract StructLike asStructLike(T data);
public void write(T row) throws IOException {
PathOffset pathOffset = PathOffset.of(dataWriter.currentPath(), dataWriter.currentRows());
// Create a copied key from this row.
StructLike copiedKey = StructCopy.copy(structProjection.wrap(asStructLike(row)));
// Adding a pos-delete to replace the old path-offset.
PathOffset previous = insertedRowMap.put(copiedKey, pathOffset);
if (previous != null) {
// TODO attach the previous row if there is a positional-delete row schema in the appender factory.
posDeleteWriter.delete(previous.path, previous.rowOffset, null);
}
dataWriter.write(row);
}
/**
* Write the pos-delete if there's an existing row matching the given key.
*
* @param key has the same columns as the equality fields.
*/
private void internalPosDelete(StructLike key) {
PathOffset previous = insertedRowMap.remove(key);
if (previous != null) {
// TODO attach the previous row if there is a positional-delete row schema in the appender factory.
posDeleteWriter.delete(previous.path, previous.rowOffset, null);
}
}
/**
* Delete those rows whose equality fields have the same values as the given row. It will write the entire row into
* the equality-delete file.
*
* @param row the given row to delete.
*/
public void delete(T row) throws IOException {
internalPosDelete(structProjection.wrap(asStructLike(row)));
eqDeleteWriter.write(row);
}
/**
* Delete those rows with the given key. It will only write the values of equality fields into the equality-delete
* file.
*
* @param key is the projected data whose columns are the same as the equality fields.
*/
public void deleteKey(T key) throws IOException {
internalPosDelete(asStructLike(key));
eqDeleteWriter.write(key);
}
@Override
public void close() throws IOException {
// Close data writer and add completed data files.
if (dataWriter != null) {
dataWriter.close();
dataWriter = null;
}
// Close eq-delete writer and add completed equality-delete files.
if (eqDeleteWriter != null) {
eqDeleteWriter.close();
eqDeleteWriter = null;
}
if (insertedRowMap != null) {
insertedRowMap.clear();
insertedRowMap = null;
}
// Add the completed pos-delete files.
if (posDeleteWriter != null) {
completedDeleteFiles.addAll(posDeleteWriter.complete());
referencedDataFiles.addAll(posDeleteWriter.referencedDataFiles());
posDeleteWriter = null;
}
}
}
private static class PathOffset {
private final CharSequence path;
private final long rowOffset;
private PathOffset(CharSequence path, long rowOffset) {
this.path = path;
this.rowOffset = rowOffset;
}
private static PathOffset of(CharSequence path, long rowOffset) {
return new PathOffset(path, rowOffset);
}
@Override
public String toString() {
return MoreObjects.toStringHelper(this).add("path", path).add("row_offset", rowOffset).toString();
}
}
private abstract class BaseRollingWriter<W extends Closeable> implements Closeable {
private static final int ROWS_DIVISOR = 1000;
private final StructLike partitionKey;
private EncryptedOutputFile currentFile = null;
private W currentWriter = null;
private long currentRows = 0;
private BaseRollingWriter(StructLike partitionKey) {
this.partitionKey = partitionKey;
openCurrent();
}
abstract W newWriter(EncryptedOutputFile file, StructLike partition);
abstract long length(W writer);
abstract void write(W writer, T record);
abstract void complete(W closedWriter);
public void write(T record) throws IOException {
write(currentWriter, record);
this.currentRows++;
if (shouldRollToNewFile()) {
closeCurrent();
openCurrent();
}
}
public CharSequence currentPath() {
Preconditions.checkNotNull(currentFile, "The currentFile shouldn't be null");
return currentFile.encryptingOutputFile().location();
}
public long currentRows() {
return currentRows;
}
private void openCurrent() {
if (partitionKey == null) {
// unpartitioned
this.currentFile = fileFactory.newOutputFile();
} else {
// partitioned
this.currentFile = fileFactory.newOutputFile(partitionKey);
}
this.currentWriter = newWriter(currentFile, partitionKey);
this.currentRows = 0;
}
private boolean shouldRollToNewFile() {
// TODO: ORC files do not yet support checking the target file size before they are closed
return !format.equals(FileFormat.ORC) && currentRows % ROWS_DIVISOR == 0 && length(currentWriter) >= targetFileSize;
}
private void closeCurrent() throws IOException {
if (currentWriter != null) {
currentWriter.close();
if (currentRows == 0L) {
io.deleteFile(currentFile.encryptingOutputFile());
} else {
complete(currentWriter);
}
this.currentFile = null;
this.currentWriter = null;
this.currentRows = 0;
}
}
@Override
public void close() throws IOException {
closeCurrent();
}
}
protected class RollingFileWriter extends BaseRollingWriter<DataWriter<T>> {
public RollingFileWriter(StructLike partitionKey) {
super(partitionKey);
}
@Override
DataWriter<T> newWriter(EncryptedOutputFile file, StructLike partitionKey) {
return appenderFactory.newDataWriter(file, format, partitionKey);
}
@Override
long length(DataWriter<T> writer) {
return writer.length();
}
@Override
void write(DataWriter<T> writer, T record) {
writer.add(record);
}
@Override
void complete(DataWriter<T> closedWriter) {
completedDataFiles.add(closedWriter.toDataFile());
}
}
protected class RollingEqDeleteWriter extends BaseRollingWriter<EqualityDeleteWriter<T>> {
RollingEqDeleteWriter(StructLike partitionKey) {
super(partitionKey);
}
@Override
EqualityDeleteWriter<T> newWriter(EncryptedOutputFile file, StructLike partitionKey) {
return appenderFactory.newEqDeleteWriter(file, format, partitionKey);
}
@Override
long length(EqualityDeleteWriter<T> writer) {
return writer.length();
}
@Override
void write(EqualityDeleteWriter<T> writer, T record) {
writer.delete(record);
}
@Override
void complete(EqualityDeleteWriter<T> closedWriter) {
completedDeleteFiles.add(closedWriter.toDeleteFile());
}
}
}
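The shouldRollToNewFile check in BaseRollingWriter above is where FileFormat influences file rolling: the length probe is only evaluated every ROWS_DIVISOR rows and is skipped for ORC, since ORC writers cannot report a reliable length before close. A standalone sketch of that rule (class and method names are invented for illustration):
import org.apache.iceberg.FileFormat;

public class RollCheck {
  private static final int ROWS_DIVISOR = 1000;

  // Mirrors BaseRollingWriter#shouldRollToNewFile: probe the length every
  // ROWS_DIVISOR rows, and never roll ORC files based on an in-flight length.
  static boolean shouldRoll(FileFormat format, long currentRows, long currentLength, long targetFileSize) {
    return !format.equals(FileFormat.ORC)
        && currentRows % ROWS_DIVISOR == 0
        && currentLength >= targetFileSize;
  }
}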
16
Source : EqualityDeleteWriter.java
with Apache License 2.0
from apache
public class EqualityDeleteWriter<T> implements Closeable {
private final FileAppender<T> appender;
private final FileFormat format;
private final String location;
private final PartitionSpec spec;
private final StructLike partition;
private final ByteBuffer keyMetadata;
private final int[] equalityFieldIds;
private DeleteFile deleteFile = null;
public EqualityDeleteWriter(FileAppender<T> appender, FileFormat format, String location, PartitionSpec spec, StructLike partition, EncryptionKeyMetadata keyMetadata, int... equalityFieldIds) {
this.appender = appender;
this.format = format;
this.location = location;
this.spec = spec;
this.partition = partition;
this.keyMetadata = keyMetadata != null ? keyMetadata.buffer() : null;
this.equalityFieldIds = equalityFieldIds;
}
public void deleteAll(Iterable<T> rows) {
appender.addAll(rows);
}
public void delete(T row) {
appender.add(row);
}
public long length() {
return appender.length();
}
@Override
public void close() throws IOException {
if (deleteFile == null) {
appender.close();
this.deleteFile = FileMetadata.deleteFileBuilder(spec).ofEqualityDeletes(equalityFieldIds).withFormat(format).withPath(location).withPartition(partition).withEncryptionKeyMetadata(keyMetadata).withFileSizeInBytes(appender.length()).withMetrics(appender.metrics()).build();
}
}
public DeleteFile toDeleteFile() {
Preconditions.checkState(deleteFile != null, "Cannot create delete file from unclosed writer");
return deleteFile;
}
}
15
Source : IcebergQueryRunner.java
with Apache License 2.0
from trinodb
public static DistributedQueryRunner createIcebergQueryRunner(Map<String, String> extraProperties, FileFormat format, List<TpchTable<?>> tables) throws Exception {
Session session = testSessionBuilder().setCatalog(ICEBERG_CATALOG).setSchema("tpch").build();
DistributedQueryRunner queryRunner = DistributedQueryRunner.builder(session).setExtraProperties(extraProperties).build();
queryRunner.installPlugin(new TpchPlugin());
queryRunner.createCatalog("tpch", "tpch");
Path dataDir = queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data");
queryRunner.installPlugin(new IcebergPlugin());
Map<String, String> icebergProperties = ImmutableMap.<String, String>builder().put("hive.metastore", "file").put("hive.metastore.catalog.dir", dataDir.toString()).put("iceberg.file-format", format.name()).build();
queryRunner.createCatalog(ICEBERG_CATALOG, "iceberg", icebergProperties);
queryRunner.execute("CREATE SCHEMA tpch");
copyTpchTables(queryRunner, "tpch", TINY_SCHEMA_NAME, session, tables);
return queryRunner;
}
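A hedged usage sketch for the Trino helper above: launching a runner with an explicit Iceberg FileFormat instead of the IcebergConfig default. The launcher class and main method are invented for illustration; the two-argument createIcebergQueryRunner overload is one of those listed on this page.
import com.google.common.collect.ImmutableMap;
import org.apache.iceberg.FileFormat;

public class IcebergQueryRunnerLauncher {
  public static void main(String[] args) throws Exception {
    // No extra coordinator properties; force ORC data files for the TPC-H tables.
    IcebergQueryRunner.createIcebergQueryRunner(ImmutableMap.of(), FileFormat.ORC);
  }
}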
15
Source : TestHelpers.java
with Apache License 2.0
from ExpediaGroup
public static DataFile writeFile(File targetFile, Table table, StructLike partitionData, FileFormat fileFormat, List<Record> records) throws IOException {
if (targetFile.exists()) {
if (!targetFile.delete()) {
throw new IOException("Unable to delete " + targetFile.getAbsolutePath());
}
}
FileAppender<Record> appender;
switch(fileFormat) {
case AVRO:
appender = Avro.write(Files.localOutput(targetFile)).schema(table.schema()).createWriterFunc(DataWriter::create).named(fileFormat.name()).build();
break;
case PARQUET:
appender = Parquet.write(Files.localOutput(targetFile)).schema(table.schema()).createWriterFunc(GenericParquetWriter::buildWriter).named(fileFormat.name()).build();
break;
case ORC:
appender = ORC.write(Files.localOutput(targetFile)).schema(table.schema()).createWriterFunc(GenericOrcWriter::buildWriter).build();
break;
default:
throw new UnsupportedOperationException("Cannot write format: " + fileFormat);
}
try {
appender.addAll(records);
} finally {
appender.close();
}
DataFiles.Builder builder = DataFiles.builder(table.spec()).withPath(targetFile.toString()).withFormat(fileFormat).withFileSizeInBytes(targetFile.length()).withMetrics(appender.metrics());
if (partitionData != null) {
builder.withPartition(partitionData);
}
return builder.build();
}
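A hedged sketch of calling the helper above: writing a handful of generic Records into a Parquet data file and committing it to a table. The wrapper class, method name, and arguments are invented for illustration; the null partition data assumes an unpartitioned spec.
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.Record;

public class WriteFileUsage {
  // Writes the records as a single Parquet file and appends it to the table.
  static void appendParquetFile(Table table, File targetFile, List<Record> records) throws IOException {
    DataFile dataFile = TestHelpers.writeFile(targetFile, table, null, FileFormat.PARQUET, records);
    table.newAppend().appendFile(dataFile).commit();
  }
}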
15
Source : TestSparkTableUtil.java
with Apache License 2.0
from apache
static void loadData(FileFormat fileFormat) {
// Create a hive table.
SQLContext sc = new SQLContext(TestSparkTableUtil.spark);
sc.sql(String.format("CREATE TABLE %s (\n" + " id int COMMENT 'unique id'\n" + ")\n" + "PARreplacedIONED BY (data string)\n" + "STORED AS %s\n" + "LOCATION '%s'", QUALIFIED_TABLE_NAME, fileFormat, TABLE_LOCATION_STR));
List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
df.select("id", "data").orderBy("data").write().mode("append").insertInto(QUALIFIED_TABLE_NAME);
}
15
Source : TestSparkDataWrite.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public abstract class TestSparkDataWrite {
private static final Configuration CONF = new Configuration();
private final FileFormat format;
private static SparkSession spark = null;
private static final Schema SCHEMA = new Schema(optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get()));
@Rule
public TemporaryFolder temp = new TemporaryFolder();
@Parameterized.Parameters(name = "format = {0}")
public static Object[] parameters() {
return new Object[] { "parquet", "avro", "orc" };
}
@BeforeClass
public static void startSpark() {
TestSparkDataWrite.spark = SparkSession.builder().master("local[2]").getOrCreate();
}
@AfterClass
public static void stopSpark() {
SparkSession currentSpark = TestSparkDataWrite.spark;
TestSparkDataWrite.spark = null;
currentSpark.stop();
}
public TestSparkDataWrite(String format) {
this.format = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));
}
@Test
public void testBasicWrite() throws IOException {
File parent = temp.newFolder(format.toString());
File location = new File(parent, "test");
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
Table table = tables.create(SCHEMA, spec, location.toString());
List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
// TODO: incoming columns must be ordered according to the table's schema
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
table.refresh();
Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.clreplaced)).collectAsList();
replacedert.replacedertEquals("Number of rows should match", expected.size(), actual.size());
replacedert.replacedertEquals("Result rows should match", expected, actual);
for (ManifestFile manifest : table.currentSnapshot().allManifests()) {
for (DataFile file : ManifestFiles.read(manifest, table.io())) {
// TODO: Avro does not support split offsets
if (!format.equals(FileFormat.AVRO)) {
Assert.assertNotNull("Split offsets not present", file.splitOffsets());
}
Assert.assertEquals("Should have reported record count as 1", 1, file.recordCount());
// TODO: append more metric info
if (format.equals(FileFormat.PARQUET)) {
replacedert.replacedertNotNull("Column sizes metric not present", file.columnSizes());
replacedert.replacedertNotNull("Counts metric not present", file.valueCounts());
replacedert.replacedertNotNull("Null value counts metric not present", file.nullValueCounts());
replacedert.replacedertNotNull("Lower bounds metric not present", file.lowerBounds());
replacedert.replacedertNotNull("Upper bounds metric not present", file.upperBounds());
}
}
}
}
@Test
public void testAppend() throws IOException {
File parent = temp.newFolder(format.toString());
File location = new File(parent, "test");
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
Table table = tables.create(SCHEMA, spec, location.toString());
List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"), new SimpleRecord(4, "a"), new SimpleRecord(5, "b"), new SimpleRecord(6, "c"));
Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
df.withColumn("id", df.col("id").plus(3)).select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
table.refresh();
Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
Assert.assertEquals("Result rows should match", expected, actual);
}
@Test
public void testOverwrite() throws IOException {
File parent = temp.newFolder(format.toString());
File location = new File(parent, "test");
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build();
Table table = tables.create(SCHEMA, spec, location.toString());
List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "a"), new SimpleRecord(3, "c"), new SimpleRecord(4, "b"), new SimpleRecord(6, "c"));
Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
// overwrite with 2*id to replace record 2, append 4 and 6
df.withColumn("id", df.col("id").multiply(2)).select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Overwrite).option("overwrite-mode", "dynamic").save(location.toString());
table.refresh();
Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
Assert.assertEquals("Result rows should match", expected, actual);
}
@Test
public void testUnpartitionedOverwrite() throws IOException {
File parent = temp.newFolder(format.toString());
File location = new File(parent, "test");
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.unpartitioned();
Table table = tables.create(SCHEMA, spec, location.toString());
List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
// overwrite with the same data; should not produce two copies
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Overwrite).save(location.toString());
table.refresh();
Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
Assert.assertEquals("Result rows should match", expected, actual);
}
@Test
public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws IOException {
File parent = temp.newFolder(format.toString());
File location = new File(parent, "test");
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.unpartitioned();
Table table = tables.create(SCHEMA, spec, location.toString());
table.updateProperties().set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, // ~4 bytes; low enough to trigger
"4").commit();
List<SimpleRecord> expected = Lists.newArrayListWithCapacity(4000);
for (int i = 0; i < 4000; i++) {
expected.add(new SimpleRecord(i, "a"));
}
Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
table.refresh();
Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
Assert.assertEquals("Result rows should match", expected, actual);
List<DataFile> files = Lists.newArrayList();
for (ManifestFile manifest : table.currentSnapshot().allManifests()) {
for (DataFile file : ManifestFiles.read(manifest, table.io())) {
files.add(file);
}
}
// TODO: ORC files do not yet support a target file size
if (!format.equals(FileFormat.ORC)) {
Assert.assertEquals("Should have 4 DataFiles", 4, files.size());
Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000));
}
}
@Test
public void testPartitionedCreateWithTargetFileSizeViaOption() throws IOException {
partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType.NONE);
}
@Test
public void testPartitionedFanoutCreateWithTargetFileSizeViaOption() throws IOException {
partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType.TABLE);
}
@Test
public void testPartitionedFanoutCreateWithTargetFileSizeViaOption2() throws IOException {
partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType.JOB);
}
@Test
public void testWriteProjection() throws IOException {
Assume.assumeTrue("Not supported in Spark 3.0; analysis requires all columns are present", spark.version().startsWith("2"));
File parent = temp.newFolder(format.toString());
File location = new File(parent, "test");
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.unpartitioned();
Table table = tables.create(SCHEMA, spec, location.toString());
List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null));
Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
// select only id column
df.select("id").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
table.refresh();
Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
Assert.assertEquals("Result rows should match", expected, actual);
}
@Test
public void testWriteProjectionWithMiddle() throws IOException {
Assume.assumeTrue("Not supported in Spark 3.0; analysis requires all columns are present", spark.version().startsWith("2"));
File parent = temp.newFolder(format.toString());
File location = new File(parent, "test");
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.unpartitioned();
Schema schema = new Schema(optional(1, "c1", Types.IntegerType.get()), optional(2, "c2", Types.StringType.get()), optional(3, "c3", Types.StringType.get()));
Table table = tables.create(schema, spec, location.toString());
List<ThreeColumnRecord> expected = Lists.newArrayList(new ThreeColumnRecord(1, null, "hello"), new ThreeColumnRecord(2, null, "world"), new ThreeColumnRecord(3, null, null));
Dataset<Row> df = spark.createDataFrame(expected, ThreeColumnRecord.class);
df.select("c1", "c3").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
table.refresh();
Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
List<ThreeColumnRecord> actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
Assert.assertEquals("Result rows should match", expected, actual);
}
@Test
public void testViewsReturnRecentResults() throws IOException {
File parent = temp.newFolder(format.toString());
File location = new File(parent, "test");
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
tables.create(SCHEMA, spec, location.toString());
List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
Dataset<Row> query = spark.read().format("iceberg").load(location.toString()).where("id = 1");
query.createOrReplaceTempView("tmp");
List<SimpleRecord> actual1 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList();
List<SimpleRecord> expected1 = Lists.newArrayList(new SimpleRecord(1, "a"));
Assert.assertEquals("Number of rows should match", expected1.size(), actual1.size());
Assert.assertEquals("Result rows should match", expected1, actual1);
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).save(location.toString());
List<SimpleRecord> actual2 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList();
List<SimpleRecord> expected2 = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "a"));
Assert.assertEquals("Number of rows should match", expected2.size(), actual2.size());
Assert.assertEquals("Result rows should match", expected2, actual2);
}
public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType option) throws IOException {
File parent = temp.newFolder(format.toString());
File location = new File(parent, "test");
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
Table table = tables.create(SCHEMA, spec, location.toString());
List<SimpleRecord> expected = Lists.newArrayListWithCapacity(8000);
for (int i = 0; i < 2000; i++) {
expected.add(new SimpleRecord(i, "a"));
expected.add(new SimpleRecord(i, "b"));
expected.add(new SimpleRecord(i, "c"));
expected.add(new SimpleRecord(i, "d"));
}
Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
switch(option) {
case NONE:
df.select("id", "data").sort("data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, // ~4 bytes; low enough to trigger
4).save(location.toString());
break;
case TABLE:
table.updateProperties().set(SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, "true").commit();
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, // ~4 bytes; low enough to trigger
4).save(location.toString());
break;
case JOB:
df.select("id", "data").write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, format.toString()).mode(SaveMode.Append).option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, // ~4 bytes; low enough to trigger
4).option(SparkWriteOptions.FANOUT_ENABLED, true).save(location.toString());
break;
default:
break;
}
table.refresh();
Dataset<Row> result = spark.read().format("iceberg").load(location.toString());
List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
Assert.assertEquals("Result rows should match", expected, actual);
List<DataFile> files = Lists.newArrayList();
for (ManifestFile manifest : table.currentSnapshot().allManifests()) {
for (DataFile file : ManifestFiles.read(manifest, table.io())) {
files.add(file);
}
}
// TODO: ORC files do not currently support the target file size option
if (!format.equals(FileFormat.ORC)) {
Assert.assertEquals("Should have 8 DataFiles", 8, files.size());
Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000));
}
}
public enum IcebergOptionsType {
NONE, TABLE, JOB
}
}
15
Source : SparkAppenderFactory.java
with Apache License 2.0
from apache
@Override
public PositionDeleteWriter<InternalRow> newPosDeleteWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) {
try {
switch(format) {
case PARQUET:
StructType sparkPosDeleteSchema = SparkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema));
return Parquet.writeDeletes(file.encryptingOutputFile()).createWriterFunc(msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)).overwrite().rowSchema(posDeleteRowSchema).withSpec(spec).withPartition(partition).withKeyMetadata(file.keyMetadata()).transformPaths(path -> UTF8String.fromString(path.toString())).buildPositionWriter();
case AVRO:
return Avro.writeDeletes(file.encryptingOutputFile()).createWriterFunc(ignored -> new SparkAvroWriter(lazyPosDeleteSparkType())).overwrite().rowSchema(posDeleteRowSchema).withSpec(spec).withPartition(partition).withKeyMetadata(file.keyMetadata()).buildPositionWriter();
default:
throw new UnsupportedOperationException("Cannot write pos-deletes for unsupported file format: " + format);
}
} catch (IOException e) {
throw new UncheckedIOException("Failed to create new position delete writer", e);
}
}
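The writer above branches on the FileFormat value and rejects formats it cannot handle. The short sketch below is not part of SparkAppenderFactory; the chosen format and the base path are made up for illustration. It shows the same kind of dispatch together with FileFormat.addExtension, which appends the format's file extension only when it is missing.
import org.apache.iceberg.FileFormat;

public class FileFormatDispatchSketch {
    public static void main(String[] args) {
        // Hypothetical format choice and base path, used only for this sketch.
        FileFormat chosenFormat = FileFormat.PARQUET;
        String dataFileName = chosenFormat.addExtension("data/part-00000");
        System.out.println(dataFileName); // data/part-00000.parquet
        switch (chosenFormat) {
            case PARQUET:
            case AVRO:
                System.out.println(chosenFormat + " is handled by this sketch");
                break;
            default:
                // Mirrors the unsupported-format error style of the writer above.
                throw new UnsupportedOperationException("Unsupported file format: " + chosenFormat);
        }
    }
}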
15
Source : TestHiveIcebergStorageHandlerLocalScan.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public class TestHiveIcebergStorageHandlerLocalScan {
@Parameters(name = "fileFormat={0}, catalog={1}")
public static Collection<Object[]> parameters() {
Collection<Object[]> testParams = new ArrayList<>();
// Run tests with every FileFormat for a single Catalog (HiveCatalog)
for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) {
testParams.add(new Object[] { fileFormat, TestTables.TestTableType.HIVE_CATALOG });
}
// Run tests for every Catalog with a single FileFormat (PARQUET) - skip HiveCatalog tests since they were already added above
for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) {
if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) {
testParams.add(new Object[] { FileFormat.PARQUET, testTableType });
}
}
return testParams;
}
private static TestHiveShell shell;
private TestTables testTables;
@Parameter(0)
public FileFormat fileFormat;
@Parameter(1)
public TestTables.TestTableType testTableType;
@Rule
public TemporaryFolder temp = new TemporaryFolder();
@BeforeClass
public static void beforeClass() {
shell = HiveIcebergStorageHandlerTestUtils.shell();
}
@AfterClass
public static void afterClass() {
shell.stop();
}
@Before
public void before() throws IOException {
testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp);
// Uses spark as an engine so we can detect if we unintentionally try to use any execution engines
HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "spark");
}
@After
public void after() throws Exception {
HiveIcebergStorageHandlerTestUtils.close(shell);
}
@Test
public void testScanEmptyTable() throws IOException {
Schema emptySchema = new Schema(required(1, "empty", Types.StringType.get()));
testTables.createTable(shell, "empty", emptySchema, fileFormat, ImmutableList.of());
List<Object[]> rows = shell.executeStatement("SELECT * FROM default.empty");
Assert.assertEquals(0, rows.size());
}
@Test
public void testScanTable() throws IOException {
testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
// Single fetch task: no MR job.
List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
Assert.assertEquals(3, rows.size());
Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, rows.get(0));
Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, rows.get(1));
Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, rows.get(2));
}
@Test
public void testScanTableCaseInsensitive() throws IOException {
testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA_WITH_UPPERCASE, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
Assert.assertEquals(3, rows.size());
Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, rows.get(0));
Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, rows.get(1));
Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, rows.get(2));
rows = shell.executeStatement("SELECT * FROM default.customers where CustomER_Id < 2 " + "and first_name in ('Alice', 'Bob')");
Assert.assertEquals(2, rows.size());
Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, rows.get(0));
Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, rows.get(1));
}
@Test
public void testDecimalTableWithPredicateLiterals() throws IOException {
Schema schema = new Schema(required(1, "decimal_field", Types.DecimalType.of(7, 2)));
List<Record> records = TestHelper.RecordsBuilder.newInstance(schema).add(new BigDecimal("85.00")).add(new BigDecimal("100.56")).add(new BigDecimal("100.57")).build();
testTables.createTable(shell, "dec_test", schema, fileFormat, records);
// Use integer literal in predicate
List<Object[]> rows = shell.executeStatement("SELECT * FROM default.dec_test where decimal_field >= 85");
Assert.assertEquals(3, rows.size());
Assert.assertArrayEquals(new Object[] { "85.00" }, rows.get(0));
Assert.assertArrayEquals(new Object[] { "100.56" }, rows.get(1));
Assert.assertArrayEquals(new Object[] { "100.57" }, rows.get(2));
// Use decimal literal in predicate with smaller scale than schema type definition
rows = shell.executeStatement("SELECT * FROM default.dec_test where decimal_field > 99.1");
Assert.assertEquals(2, rows.size());
Assert.assertArrayEquals(new Object[] { "100.56" }, rows.get(0));
Assert.assertArrayEquals(new Object[] { "100.57" }, rows.get(1));
// Use decimal literal in predicate with higher scale than schema type definition
rows = shell.executeStatement("SELECT * FROM default.dec_test where decimal_field > 100.565");
Assert.assertEquals(1, rows.size());
Assert.assertArrayEquals(new Object[] { "100.57" }, rows.get(0));
// Use decimal literal in predicate with the same scale as schema type definition
rows = shell.executeStatement("SELECT * FROM default.dec_test where decimal_field > 640.34");
Assert.assertEquals(0, rows.size());
}
@Test
public void testColumnSelection() throws IOException {
testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
List<Object[]> outOfOrderColumns = shell.executeStatement("SELECT first_name, customer_id, last_name FROM default.customers");
Assert.assertEquals(3, outOfOrderColumns.size());
Assert.assertArrayEquals(new Object[] { "Alice", 0L, "Brown" }, outOfOrderColumns.get(0));
Assert.assertArrayEquals(new Object[] { "Bob", 1L, "Green" }, outOfOrderColumns.get(1));
Assert.assertArrayEquals(new Object[] { "Trudy", 2L, "Pink" }, outOfOrderColumns.get(2));
List<Object[]> allButFirstColumn = shell.executeStatement("SELECT first_name, last_name FROM default.customers");
Assert.assertEquals(3, allButFirstColumn.size());
Assert.assertArrayEquals(new Object[] { "Alice", "Brown" }, allButFirstColumn.get(0));
Assert.assertArrayEquals(new Object[] { "Bob", "Green" }, allButFirstColumn.get(1));
Assert.assertArrayEquals(new Object[] { "Trudy", "Pink" }, allButFirstColumn.get(2));
List<Object[]> allButMiddleColumn = shell.executeStatement("SELECT customer_id, last_name FROM default.customers");
Assert.assertEquals(3, allButMiddleColumn.size());
Assert.assertArrayEquals(new Object[] { 0L, "Brown" }, allButMiddleColumn.get(0));
Assert.assertArrayEquals(new Object[] { 1L, "Green" }, allButMiddleColumn.get(1));
Assert.assertArrayEquals(new Object[] { 2L, "Pink" }, allButMiddleColumn.get(2));
List<Object[]> allButLastColumn = shell.executeStatement("SELECT customer_id, first_name FROM default.customers");
Assert.assertEquals(3, allButLastColumn.size());
Assert.assertArrayEquals(new Object[] { 0L, "Alice" }, allButLastColumn.get(0));
Assert.assertArrayEquals(new Object[] { 1L, "Bob" }, allButLastColumn.get(1));
Assert.assertArrayEquals(new Object[] { 2L, "Trudy" }, allButLastColumn.get(2));
}
@Test
public void selectSameColumnTwice() throws IOException {
testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
List<Object[]> columns = shell.executeStatement("SELECT first_name, first_name FROM default.customers");
Assert.assertEquals(3, columns.size());
Assert.assertArrayEquals(new Object[] { "Alice", "Alice" }, columns.get(0));
Assert.assertArrayEquals(new Object[] { "Bob", "Bob" }, columns.get(1));
Assert.assertArrayEquals(new Object[] { "Trudy", "Trudy" }, columns.get(2));
}
@Test
public void testCreateTableWithColumnSpecification() throws IOException {
TableIdentifier identifier = TableIdentifier.of("default", "customers");
Map<StructLike, List<Record>> data = new HashMap<>(1);
data.put(null, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
String createSql = "CREATE EXTERNAL TABLE " + identifier + " (customer_id BIGINT, first_name STRING COMMENT 'This is first name', " + "last_name STRING COMMENT 'This is last name')" + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + testTables.locationForCreateTableSQL(identifier);
runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, PartitionSpec.unpartitioned(), data);
}
@Test
public void testCreateTableWithColumnSpecificationPartitioned() throws IOException {
TableIdentifier identifier = TableIdentifier.of("default", "customers");
PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("last_name").build();
Map<StructLike, List<Record>> data = ImmutableMap.of(Row.of("Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), Row.of("Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), Row.of("Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2)));
String createSql = "CREATE EXTERNAL TABLE " + identifier + " (customer_id BIGINT, first_name STRING COMMENT 'This is first name') " + "PARTITIONED BY (last_name STRING COMMENT 'This is last name') STORED BY " + "'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + testTables.locationForCreateTableSQL(identifier);
runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data);
}
@Test
public void testCreatePartitionedTableByProperty() throws IOException {
TableIdentifier identifier = TableIdentifier.of("default", "customers");
PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("last_name").build();
Map<StructLike, List<Record>> data = ImmutableMap.of(Row.of("Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), Row.of("Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), Row.of("Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2)));
String createSql = "CREATE EXTERNAL TABLE " + identifier + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + testTables.locationForCreateTableSQL(identifier) + "TBLPROPERTIES ('" + InputFormatConfig.PARTITION_SPEC + "'='" + PartitionSpecParser.toJson(spec) + "', " + "'" + InputFormatConfig.TABLE_SCHEMA + "'='" + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "')";
runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data);
}
@Test
public void testCreateTableWithColumnSpecificationMultilevelPartitioned() throws IOException {
TableIdentifier identifier = TableIdentifier.of("default", "customers");
PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("first_name").identity("last_name").build();
Map<StructLike, List<Record>> data = ImmutableMap.of(Row.of("Alice", "Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), Row.of("Bob", "Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), Row.of("Trudy", "Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2)));
String createSql = "CREATE EXTERNAL TABLE " + identifier + " (customer_id BIGINT) " + "PARTITIONED BY (first_name STRING COMMENT 'This is first name', " + "last_name STRING COMMENT 'This is last name') " + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + testTables.locationForCreateTableSQL(identifier);
runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data);
}
@Test
public void testArrayOfPrimitivesInTable() throws IOException {
Schema schema = new Schema(required(1, "arrayofprimitives", Types.ListType.ofRequired(2, Types.IntegerType.get())));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1);
// access a single element from the array
for (int i = 0; i < records.size(); i++) {
List<?> expectedList = (List<?>) records.get(i).getField("arrayofprimitives");
for (int j = 0; j < expectedList.size(); j++) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT arrayofprimitives[%d] FROM default.arraytable " + "LIMIT 1 OFFSET %d", j, i));
Assert.assertEquals(expectedList.get(j), queryResult.get(0)[0]);
}
}
}
@Test
public void testArrayOfArraysInTable() throws IOException {
Schema schema = new Schema(required(1, "arrayofarrays", Types.ListType.ofRequired(2, Types.ListType.ofRequired(3, Types.DateType.get()))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1);
// access an element from a matrix
for (int i = 0; i < records.size(); i++) {
List<?> expectedList = (List<?>) records.get(i).getField("arrayofarrays");
for (int j = 0; j < expectedList.size(); j++) {
List<?> expectedInnerList = (List<?>) expectedList.get(j);
for (int k = 0; k < expectedInnerList.size(); k++) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT arrayofarrays[%d][%d] FROM default.arraytable " + "LIMIT 1 OFFSET %d", j, k, i));
Assert.assertEquals(expectedInnerList.get(k).toString(), queryResult.get(0)[0]);
}
}
}
}
@Test
public void testArrayOfMapsInTable() throws IOException {
Schema schema = new Schema(required(1, "arrayofmaps", Types.ListType.ofRequired(2, Types.MapType.ofRequired(3, 4, Types.StringType.get(), Types.BooleanType.get()))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1);
// access an element from a map in an array
for (int i = 0; i < records.size(); i++) {
List<?> expectedList = (List<?>) records.get(i).getField("arrayofmaps");
for (int j = 0; j < expectedList.size(); j++) {
Map<?, ?> expectedMap = (Map<?, ?>) expectedList.get(j);
for (Map.Entry<?, ?> entry : expectedMap.entrySet()) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT arrayofmaps[%d][\"%s\"] FROM default.arraytable LIMIT 1 OFFSET %d", j, entry.getKey(), i));
Assert.assertEquals(entry.getValue(), queryResult.get(0)[0]);
}
}
}
}
@Test
public void testArrayOfStructsInTable() throws IOException {
Schema schema = new Schema(required(1, "arrayofstructs", Types.ListType.ofRequired(2, Types.StructType.of(required(3, "something", Types.DoubleType.get()), required(4, "someone", Types.LongType.get()), required(5, "somewhere", Types.StringType.get())))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1);
// access an element from a struct in an array
for (int i = 0; i < records.size(); i++) {
List<?> expectedList = (List<?>) records.get(i).getField("arrayofstructs");
for (int j = 0; j < expectedList.size(); j++) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT arrayofstructs[%d].something, " + "arrayofstructs[%d].someone, arrayofstructs[%d].somewhere FROM default.arraytable LIMIT 1 " + "OFFSET %d", j, j, j, i));
GenericRecord genericRecord = (GenericRecord) expectedList.get(j);
Assert.assertEquals(genericRecord.getField("something"), queryResult.get(0)[0]);
Assert.assertEquals(genericRecord.getField("someone"), queryResult.get(0)[1]);
Assert.assertEquals(genericRecord.getField("somewhere"), queryResult.get(0)[2]);
}
}
}
@Test
public void testMapOfPrimitivesInTable() throws IOException {
Schema schema = new Schema(required(1, "mapofprimitives", Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.IntegerType.get())));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1);
// access a single value from the map
for (int i = 0; i < records.size(); i++) {
Map<?, ?> expectedMap = (Map<?, ?>) records.get(i).getField("mapofprimitives");
for (Map.Entry<?, ?> entry : expectedMap.entrySet()) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT mapofprimitives[\"%s\"] " + "FROM default.maptable LIMIT 1 OFFSET %d", entry.getKey(), i));
Assert.assertEquals(entry.getValue(), queryResult.get(0)[0]);
}
}
}
@Test
public void testMapOfArraysInTable() throws IOException {
Schema schema = new Schema(required(1, "mapofarrays", Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.ListType.ofRequired(4, Types.DateType.get()))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1);
// access a single element from a list in a map
for (int i = 0; i < records.size(); i++) {
Map<?, ?> expectedMap = (Map<?, ?>) records.get(i).getField("mapofarrays");
for (Map.Entry<?, ?> entry : expectedMap.entrySet()) {
List<?> expectedList = (List<?>) entry.getValue();
for (int j = 0; j < expectedList.size(); j++) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT mapofarrays[\"%s\"]" + "[%d] FROM maptable LIMIT 1 OFFSET %d", entry.getKey(), j, i));
Assert.assertEquals(expectedList.get(j).toString(), queryResult.get(0)[0]);
}
}
}
}
@Test
public void testMapOfMapsInTable() throws IOException {
Schema schema = new Schema(required(1, "mapofmaps", Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.StringType.get()))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1);
// access a single element from a map in a map
for (int i = 0; i < records.size(); i++) {
Map<?, ?> expectedMap = (Map<?, ?>) records.get(i).getField("mapofmaps");
for (Map.Entry<?, ?> entry : expectedMap.entrySet()) {
Map<?, ?> expectedInnerMap = (Map<?, ?>) entry.getValue();
for (Map.Entry<?, ?> innerEntry : expectedInnerMap.entrySet()) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT mapofmaps[\"%s\"]" + "[\"%s\"] FROM maptable LIMIT 1 OFFSET %d", entry.getKey(), innerEntry.getKey(), i));
Assert.assertEquals(innerEntry.getValue(), queryResult.get(0)[0]);
}
}
}
}
@Test
public void testMapOfStructsInTable() throws IOException {
Schema schema = new Schema(required(1, "mapofstructs", Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StructType.of(required(4, "something", Types.DoubleType.get()), required(5, "someone", Types.LongType.get()), required(6, "somewhere", Types.StringType.get())))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1);
// access a single element from a struct in a map
for (int i = 0; i < records.size(); i++) {
Map<?, ?> expectedMap = (Map<?, ?>) records.get(i).getField("mapofstructs");
for (Map.Entry<?, ?> entry : expectedMap.entrySet()) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT mapofstructs[\"%s\"].something, " + "mapofstructs[\"%s\"].someone, mapofstructs[\"%s\"].somewhere FROM default.maptable LIMIT 1 " + "OFFSET %d", entry.getKey(), entry.getKey(), entry.getKey(), i));
GenericRecord genericRecord = (GenericRecord) entry.getValue();
Assert.assertEquals(genericRecord.getField("something"), queryResult.get(0)[0]);
Assert.assertEquals(genericRecord.getField("someone"), queryResult.get(0)[1]);
Assert.assertEquals(genericRecord.getField("somewhere"), queryResult.get(0)[2]);
}
}
}
@Test
public void testStructOfPrimitivesInTable() throws IOException {
Schema schema = new Schema(required(1, "structofprimitives", Types.StructType.of(required(2, "key", Types.StringType.get()), required(3, "value", Types.IntegerType.get()))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1);
// access a single value in a struct
for (int i = 0; i < records.size(); i++) {
GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofprimitives");
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT structofprimitives.key, structofprimitives.value FROM default.structtable LIMIT 1 OFFSET %d", i));
Assert.assertEquals(expectedStruct.getField("key"), queryResult.get(0)[0]);
Assert.assertEquals(expectedStruct.getField("value"), queryResult.get(0)[1]);
}
}
@Test
public void testStructOfArraysInTable() throws IOException {
Schema schema = new Schema(required(1, "structofarrays", Types.StructType.of(required(2, "names", Types.ListType.ofRequired(3, Types.StringType.get())), required(4, "birthdays", Types.ListType.ofRequired(5, Types.DateType.get())))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1);
// access an element of an array inside a struct
for (int i = 0; i < records.size(); i++) {
GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofarrays");
List<?> expectedList = (List<?>) expectedStruct.getField("names");
for (int j = 0; j < expectedList.size(); j++) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT structofarrays.names[%d] FROM default.structtable LIMIT 1 OFFSET %d", j, i));
Assert.assertEquals(expectedList.get(j), queryResult.get(0)[0]);
}
expectedList = (List<?>) expectedStruct.getField("birthdays");
for (int j = 0; j < expectedList.size(); j++) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT structofarrays.birthdays[%d] FROM default.structtable LIMIT 1 OFFSET %d", j, i));
Assert.assertEquals(expectedList.get(j).toString(), queryResult.get(0)[0]);
}
}
}
@Test
public void testStructOfMapsInTable() throws IOException {
Schema schema = new Schema(required(1, "structofmaps", Types.StructType.of(required(2, "map1", Types.MapType.ofRequired(3, 4, Types.StringType.get(), Types.StringType.get())), required(5, "map2", Types.MapType.ofRequired(6, 7, Types.StringType.get(), Types.IntegerType.get())))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1);
// access a map entry inside a struct
for (int i = 0; i < records.size(); i++) {
GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofmaps");
Map<?, ?> expectedMap = (Map<?, ?>) expectedStruct.getField("map1");
for (Map.Entry<?, ?> entry : expectedMap.entrySet()) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT structofmaps.map1[\"%s\"] from default.structtable LIMIT 1 OFFSET %d", entry.getKey(), i));
Assert.assertEquals(entry.getValue(), queryResult.get(0)[0]);
}
expectedMap = (Map<?, ?>) expectedStruct.getField("map2");
for (Map.Entry<?, ?> entry : expectedMap.entrySet()) {
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT structofmaps.map2[\"%s\"] from default.structtable LIMIT 1 OFFSET %d", entry.getKey(), i));
Assert.assertEquals(entry.getValue(), queryResult.get(0)[0]);
}
}
}
@Test
public void testStructOfStructsInTable() throws IOException {
Schema schema = new Schema(required(1, "structofstructs", Types.StructType.of(required(2, "struct1", Types.StructType.of(required(3, "key", Types.StringType.get()), required(4, "value", Types.IntegerType.get()))))));
List<Record> records = testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1);
// access a struct element inside a struct
for (int i = 0; i < records.size(); i++) {
GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofstructs");
GenericRecord expectedInnerStruct = (GenericRecord) expectedStruct.getField("struct1");
List<Object[]> queryResult = shell.executeStatement(String.format("SELECT structofstructs.struct1.key, structofstructs.struct1.value FROM default.structtable " + "LIMIT 1 OFFSET %d", i));
Assert.assertEquals(expectedInnerStruct.getField("key"), queryResult.get(0)[0]);
Assert.assertEquals(expectedInnerStruct.getField("value"), queryResult.get(0)[1]);
}
}
private void runCreateAndReadTest(TableIdentifier identifier, String createSQL, Schema expectedSchema, PartitionSpec expectedSpec, Map<StructLike, List<Record>> data) throws IOException {
shell.executeStatement(createSQL);
org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier);
Assert.assertEquals(expectedSchema.asStruct(), icebergTable.schema().asStruct());
Assert.assertEquals(expectedSpec, icebergTable.spec());
List<Record> expected = Lists.newArrayList();
for (StructLike partition : data.keySet()) {
testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, fileFormat, partition, data.get(partition));
expected.addAll(data.get(partition));
}
List<Object[]> descRows = shell.executeStatement("SELECT * FROM " + identifier.toString());
List<Record> records = HiveIcebergTestUtils.valueForRow(icebergTable.schema(), descRows);
HiveIcebergTestUtils.validateData(expected, records, 0);
}
}
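The parameterized test above builds its matrix from a predefined list of formats. As a hedged sketch (the helper class name and the decision to skip METADATA are assumptions, not taken from the test), the same matrix could be derived directly from the FileFormat enum:
import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumSet;
import org.apache.iceberg.FileFormat;

public class FileFormatParameterSketch {
    public static Collection<Object[]> parameters() {
        Collection<Object[]> params = new ArrayList<>();
        // METADATA describes Iceberg metadata files rather than data files,
        // so a data-format test matrix would normally leave it out.
        for (FileFormat format : EnumSet.complementOf(EnumSet.of(FileFormat.METADATA))) {
            params.add(new Object[] { format });
        }
        return params;
    }

    public static void main(String[] args) {
        parameters().forEach(p -> System.out.println(p[0]));
    }
}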
15
Source : TestFlinkScan.java
with Apache License 2.0
from apache
@RunWith(Parameterized.class)
public abstract class TestFlinkScan {
@ClassRule
public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled();
@ClassRule
public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder();
protected HadoopCatalog catalog;
protected String warehouse;
protected String location;
// parametrized variables
protected final FileFormat fileFormat;
@Parameterized.Parameters(name = "format={0}")
public static Object[] parameters() {
return new Object[] { "avro", "parquet", "orc" };
}
TestFlinkScan(String fileFormat) {
this.fileFormat = FileFormat.valueOf(fileFormat.toUpperCase(Locale.ENGLISH));
}
@Before
public void before() throws IOException {
File warehouseFile = TEMPORARY_FOLDER.newFolder();
Assert.assertTrue(warehouseFile.delete());
// before variables
warehouse = "file:" + warehouseFile;
Configuration conf = new Configuration();
catalog = new HadoopCatalog(conf, warehouse);
location = String.format("%s/%s/%s", warehouse, TestFixtures.DATABASE, TestFixtures.TABLE);
}
@After
public void after() throws IOException {
}
protected TableLoader tableLoader() {
return TableLoader.fromHadoopTable(location);
}
protected abstract List<Row> runWithProjection(String... projected) throws Exception;
protected abstract List<Row> runWithFilter(Expression filter, String sqlFilter) throws Exception;
protected abstract List<Row> runWithOptions(Map<String, String> options) throws Exception;
protected abstract List<Row> run() throws Exception;
@Test
public void testUnpartitionedTable() throws Exception {
Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA);
List<Record> expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L);
new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable(expectedRecords);
TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA);
}
@Test
public void testPartitionedTable() throws Exception {
Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC);
List<Record> expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L);
expectedRecords.get(0).set(2, "2020-03-20");
new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords);
TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA);
}
@Test
public void testProjection() throws Exception {
Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC);
List<Record> inputRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L);
new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords);
assertRows(runWithProjection("data"), Row.of(inputRecords.get(0).get(0)));
}
@Test
public void testIdentityPartitionProjections() throws Exception {
Schema logSchema = new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get()), Types.NestedField.optional(2, "dt", Types.StringType.get()), Types.NestedField.optional(3, "level", Types.StringType.get()), Types.NestedField.optional(4, "message", Types.StringType.get()));
PartitionSpec spec = PartitionSpec.builderFor(logSchema).identity("dt").identity("level").build();
Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, logSchema, spec);
List<Record> inputRecords = RandomGenericData.generate(logSchema, 10, 0L);
int idx = 0;
AppendFiles append = table.newAppend();
for (Record record : inputRecords) {
record.set(1, "2020-03-2" + idx);
record.set(2, Integer.toString(idx));
append.appendFile(new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), ImmutableList.of(record)));
idx += 1;
}
append.commit();
// individual fields
validateIdentityPartitionProjections(table, Collections.singletonList("dt"), inputRecords);
validateIdentityPartitionProjections(table, Collections.singletonList("level"), inputRecords);
validateIdentityPartitionProjections(table, Collections.singletonList("message"), inputRecords);
validateIdentityPartitionProjections(table, Collections.singletonList("id"), inputRecords);
// field pairs
validateIdentityPartitionProjections(table, Arrays.asList("dt", "message"), inputRecords);
validateIdentityPartitionProjections(table, Arrays.asList("level", "message"), inputRecords);
validateIdentityPartitionProjections(table, Arrays.asList("dt", "level"), inputRecords);
// out-of-order pairs
validateIdentityPartitionProjections(table, Arrays.asList("message", "dt"), inputRecords);
validateIdentityPartitionProjections(table, Arrays.asList("message", "level"), inputRecords);
validateIdentityPartitionProjections(table, Arrays.asList("level", "dt"), inputRecords);
// out-of-order triplets
validateIdentityPartitionProjections(table, Arrays.asList("dt", "level", "message"), inputRecords);
validateIdentityPartitionProjections(table, Arrays.asList("level", "dt", "message"), inputRecords);
validateIdentityPartitionProjections(table, Arrays.asList("dt", "message", "level"), inputRecords);
validateIdentityPartitionProjections(table, Arrays.asList("level", "message", "dt"), inputRecords);
validateIdentityPartitionProjections(table, Arrays.asList("message", "dt", "level"), inputRecords);
validateIdentityPartitionProjections(table, Arrays.asList("message", "level", "dt"), inputRecords);
}
private void validateIdentityPartitionProjections(Table table, List<String> projectedFields, List<Record> inputRecords) throws Exception {
List<Row> rows = runWithProjection(projectedFields.toArray(new String[0]));
for (int pos = 0; pos < inputRecords.size(); pos++) {
Record inputRecord = inputRecords.get(pos);
Row actualRecord = rows.get(pos);
for (int i = 0; i < projectedFields.size(); i++) {
String name = projectedFields.get(i);
Assert.assertEquals("Projected field " + name + " should match", inputRecord.getField(name), actualRecord.getField(i));
}
}
}
@Test
public void testSnapshotReads() throws Exception {
Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA);
GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER);
List<Record> expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L);
helper.appendToTable(expectedRecords);
long snapshotId = table.currentSnapshot().snapshotId();
long timestampMillis = table.currentSnapshot().timestampMillis();
// produce another timestamp
waitUntilAfter(timestampMillis);
helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L));
TestHelpers.assertRecords(runWithOptions(ImmutableMap.of("snapshot-id", Long.toString(snapshotId))), expectedRecords, TestFixtures.SCHEMA);
TestHelpers.assertRecords(runWithOptions(ImmutableMap.of("as-of-timestamp", Long.toString(timestampMillis))), expectedRecords, TestFixtures.SCHEMA);
}
@Test
public void testIncrementalRead() throws Exception {
Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA);
GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER);
List<Record> records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L);
helper.appendToTable(records1);
long snapshotId1 = table.currentSnapshot().snapshotId();
// snapshot 2
List<Record> records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L);
helper.appendToTable(records2);
List<Record> records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L);
helper.appendToTable(records3);
long snapshotId3 = table.currentSnapshot().snapshotId();
// snapshot 4
helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L));
List<Record> expected2 = Lists.newArrayList();
expected2.addAll(records2);
expected2.addAll(records3);
TestHelpers.assertRecords(runWithOptions(ImmutableMap.<String, String>builder().put("start-snapshot-id", Long.toString(snapshotId1)).put("end-snapshot-id", Long.toString(snapshotId3)).build()), expected2, TestFixtures.SCHEMA);
}
@Test
public void testFilterExp() throws Exception {
Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC);
List<Record> expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L);
expectedRecords.get(0).set(2, "2020-03-20");
expectedRecords.get(1).set(2, "2020-03-20");
GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER);
DataFile dataFile1 = helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords);
DataFile dataFile2 = helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L));
helper.appendToTable(dataFile1, dataFile2);
TestHelpers.assertRecords(runWithFilter(Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'"), expectedRecords, TestFixtures.SCHEMA);
}
@Test
public void testPartitionTypes() throws Exception {
Schema typesSchema = new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get()), Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), Types.NestedField.optional(3, "str", Types.StringType.get()), Types.NestedField.optional(4, "binary", Types.BinaryType.get()), Types.NestedField.optional(5, "date", Types.DateType.get()), Types.NestedField.optional(6, "time", Types.TimeType.get()), Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()));
PartitionSpec spec = PartitionSpec.builderFor(typesSchema).identity("decimal").identity("str").identity("binary").identity("date").identity("time").identity("timestamp").build();
Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, typesSchema, spec);
List<Record> records = RandomGenericData.generate(typesSchema, 10, 0L);
GenericAppenderHelper appender = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER);
for (Record record : records) {
org.apache.iceberg.TestHelpers.Row partition = org.apache.iceberg.TestHelpers.Row.of(record.get(1), record.get(2), record.get(3), record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), record.get(6) == null ? null : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6)));
appender.appendToTable(partition, Collections.singletonList(record));
}
TestHelpers.assertRecords(run(), records, typesSchema);
}
private static void assertRows(List<Row> results, Row... expected) {
TestHelpers.assertRows(results, Arrays.asList(expected));
}
private static void waitUntilAfter(long timestampMillis) {
long current = System.currentTimeMillis();
while (current <= timestampMillis) {
current = System.currentTimeMillis();
}
}
}
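TestFlinkScan turns its string parameter into a FileFormat with Enum.valueOf on an upper-cased name. Below is a small, self-contained sketch of that conversion, plus FileFormat.fromFileName for deriving the format from a path; the sample strings here are hypothetical.
import java.util.Locale;
import org.apache.iceberg.FileFormat;

public class FileFormatParsingSketch {
    public static void main(String[] args) {
        // Same pattern as the TestFlinkScan constructor: case-insensitive name lookup.
        FileFormat fromName = FileFormat.valueOf("parquet".toUpperCase(Locale.ENGLISH));
        System.out.println(fromName); // PARQUET

        // fromFileName inspects the extension; it returns null for an unknown extension.
        FileFormat fromPath = FileFormat.fromFileName("warehouse/db/table/data/00000-0-sample.orc");
        System.out.println(fromPath); // ORC
    }
}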