Here are examples of the Java API org.apache.iceberg.FileScanTask, taken from open source projects.
54 Examples
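A FileScanTask is produced by planning a table scan: each task describes a byte range of one data file (path, start offset, length), the residual filter expression for that file, and any delete files that apply to it. Before the examples, here is a minimal, self-contained sketch of how such tasks are typically obtained and inspected; the table location and the "data" filter column are illustrative assumptions, not part of any example below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.io.CloseableIterable;

public class FileScanTaskExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical table location; a catalog-loaded table works the same way.
        Table table = new HadoopTables(new Configuration()).load("/tmp/iceberg-table");
        try (CloseableIterable<FileScanTask> tasks = table.newScan()
                .filter(Expressions.equal("data", "a")) // optional row filter; "data" is an assumed column
                .includeColumnStats()
                .planFiles()) {
            for (FileScanTask task : tasks) {
                // Each task covers part (or all) of a single data file plus applicable deletes.
                System.out.printf("file=%s start=%d length=%d deletes=%d residual=%s%n",
                        task.file().path(), task.start(), task.length(),
                        task.deletes().size(), task.residual());
            }
        }
    }
}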
Source : IcebergSplitSource.java
with Apache License 2.0
from trinodb
private ConnectorSplit toIcebergSplit(FileScanTask task) {
// TODO: We should leverage residual expression and convert that to TupleDomain.
// The predicate here is used by readers for predicate push down at reader level,
// so when we do not use residual expression, we are just wasting CPU cycles
// on reader side evaluating a condition that we know will always be true.
return new IcebergSplit(task.file().path().toString(), task.start(), task.length(), task.file().fileSizeInBytes(), task.file().format(), ImmutableList.of(), getPartitionKeys(task));
}
Source : TestSparkBaseDataReader.java
with Apache License 2.0
from apache
@Test
public void testClosureDuringIteration() throws IOException {
Integer totalTasks = 2;
Integer recordPerTask = 1;
List<FileScanTask> tasks = createFileScanTasks(totalTasks, recordPerTask);
Assert.assertEquals(2, tasks.size());
FileScanTask firstTask = tasks.get(0);
FileScanTask secondTask = tasks.get(1);
ClosureTrackingReader reader = new ClosureTrackingReader(tasks);
// Total of 2 elements
Assert.assertTrue(reader.next());
Assert.assertFalse("First iter should not be closed on its last element", reader.isIteratorClosed(firstTask));
Assert.assertTrue(reader.next());
Assert.assertTrue("First iter should be closed after moving to second iter", reader.isIteratorClosed(firstTask));
Assert.assertFalse("Second iter should not be closed on its last element", reader.isIteratorClosed(secondTask));
Assert.assertFalse(reader.next());
Assert.assertTrue(reader.isIteratorClosed(firstTask));
Assert.assertTrue(reader.isIteratorClosed(secondTask));
}
Source : RowDataReader.java
with Apache License 2.0
from apache
protected CloseableIterable<InternalRow> open(FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
CloseableIterable<InternalRow> iter;
if (task.isDataTask()) {
iter = newDataIterable(task.asDataTask(), readSchema);
} else {
InputFile location = getInputFile(task);
Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
switch(task.file().format()) {
case PARQUET:
iter = newParquetIterable(location, task, readSchema, idToConstant);
break;
case AVRO:
iter = newAvroIterable(location, task, readSchema, idToConstant);
break;
case ORC:
iter = newOrcIterable(location, task, readSchema, idToConstant);
break;
default:
throw new UnsupportedOperationException("Cannot read unknown format: " + task.file().format());
}
}
return iter;
}
Source : GenericReader.java
with Apache License 2.0
from apache
public CloseableIterable<Record> open(FileScanTask task) {
DeleteFilter<Record> deletes = new GenericDeleteFilter(io, task, tableSchema, projection);
Schema readSchema = deletes.requiredSchema();
CloseableIterable<Record> records = openFile(task, readSchema);
records = deletes.filter(records);
records = applyResidual(records, readSchema, task.residual());
return records;
}
Source : TableScanUtil.java
with Apache License 2.0
from apache
public static boolean hasDeletes(FileScanTask task) {
return !task.deletes().isEmpty();
}
Source : PartitionUtil.java
with Apache License 2.0
from apache
public static Map<Integer, ?> constantsMap(FileScanTask task) {
return constantsMap(task, (type, constant) -> constant);
}
Source : BaseRewriteDataFilesAction.java
with Apache License 2.0
from apache
private boolean isPartialFileScan(CombinedScanTask task) {
if (task.files().size() == 1) {
FileScanTask fileScanTask = task.files().iterator().next();
return fileScanTask.file().fileSizeInBytes() != fileScanTask.length();
} else {
return false;
}
}
Source : TestIcebergCTASWithPartition.java
with Apache License 2.0
from dremio
private void verifyPartitionValue(String tableFolder, Class expectedClass, Object expectedValue) {
Table table = new HadoopTables(new Configuration()).load(tableFolder);
for (FileScanTask fileScanTask : table.newScan().planFiles()) {
StructLike structLike = fileScanTask.file().partition();
Assert.assertEquals(structLike.get(0, expectedClass), expectedValue);
}
}
Source : Reader.java
with Apache License 2.0
from apache
@Override
public Statistics estimateStatistics() {
// it's a fresh table, no data
if (table.currentSnapshot() == null) {
return new Stats(0L, 0L);
}
// estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned)
if (!table.spec().isUnpartitioned() && filterExpression() == Expressions.alwaysTrue()) {
long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(), SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
return new Stats(SparkSchemaUtil.estimateSize(lazyType(), totalRecords), totalRecords);
}
long sizeInBytes = 0L;
long numRows = 0L;
for (CombinedScanTask task : tasks()) {
for (FileScanTask file : task.files()) {
sizeInBytes += file.length();
numRows += file.file().recordCount();
}
}
return new Stats(sizeInBytes, numRows);
}
Source : RowDataReader.java
with Apache License 2.0
from apache
private CloseableIterable<InternalRow> newAvroIterable(InputFile location, FileScanTask task, Schema projection, Map<Integer, ?> idToConstant) {
Avro.ReadBuilder builder = Avro.read(location).reuseContainers().project(projection).split(task.start(), task.length()).createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));
if (nameMapping != null) {
builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
}
return builder.build();
}
Source : RowDataReader.java
with Apache License 2.0
from apache
private CloseableIterable<InternalRow> newParquetIterable(InputFile location, FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
Parquet.ReadBuilder builder = Parquet.read(location).reuseContainers().split(task.start(), task.length()).project(readSchema).createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)).filter(task.residual()).caseSensitive(caseSensitive);
if (nameMapping != null) {
builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
}
return builder.build();
}
Source : IcebergReaderFactory.java
with Apache License 2.0
from ExpediaGroup
private CloseableIterable buildParquetReader(FileScanTask task, InputFile file, Schema schema, boolean reuseContainers) {
Parquet.ReadBuilder builder = Parquet.read(file).createReaderFunc(messageType -> GenericParquetReaders.buildReader(schema, messageType)).project(schema).filter(task.residual()).split(task.start(), task.length());
if (reuseContainers) {
builder.reuseContainers();
}
return builder.build();
}
Source : IcebergReaderFactory.java
with Apache License 2.0
from ExpediaGroup
private CloseableIterable buildAvroReader(FileScanTask task, InputFile file, Schema schema, boolean reuseContainers) {
Avro.ReadBuilder builder = Avro.read(file).createReaderFunc(DataReader::create).project(schema).split(task.start(), task.length());
if (reuseContainers) {
builder.reuseContainers();
}
return builder.build();
}
Source : IcebergReaderFactory.java
with Apache License 2.0
from ExpediaGroup
private CloseableIterable buildOrcReader(FileScanTask task, InputFile file, Schema schema, boolean reuseContainers) {
ORC.ReadBuilder builder = ORC.read(file).project(schema).filter(task.residual()).split(task.start(), task.length());
return builder.build();
}
Source : IcebergReaderFactory.java
with Apache License 2.0
from ExpediaGroup
public Iterable<Record> createReader(DataFile file, FileScanTask currentTask, InputFile inputFile, Schema tableSchema, boolean reuseContainers, Table table) {
switch(file.format()) {
case AVRO:
return buildAvroReader(currentTask, inputFile, tableSchema, reuseContainers);
case ORC:
return buildOrcReader(currentTask, inputFile, tableSchema, reuseContainers);
case PARQUET:
return buildParquetReader(currentTask, inputFile, tableSchema, reuseContainers);
case METADATA:
return buildMetadataReader(table);
default:
throw new UnsupportedOperationException(String.format("Cannot read %s file: %s", file.format().name(), file.path()));
}
}
Source : IcebergTableWrapper.java
with Apache License 2.0
from dremio
// build the list of "distinct partition values" and the corresponding dataset splits.
// TODO: this should be optimised to handle deltas.
private void buildPartitionsAndSplits() throws IOException {
PartitionConverter partitionConverter = new PartitionConverter(schema);
SplitConverter splitConverter = new SplitConverter(context, fs, schema, datasetColumnValueCounts);
// map of distinct partition values.
// iterate over all data files to get the partition values and add them to the map.
// TODO ravindra: this iteration requires reading all of the manifest files. This should go via
// the dremio wrappers.
for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
List<PartitionValue> partition = partitionConverter.from(task);
DatasetSplit split = splitConverter.from(task);
partitionChunkListing.put(partition, split);
recordCount += task.file().recordCount();
}
}
Source : BaseDataReader.java
with Apache License 2.0
from apache
protected InputFile getInputFile(FileScanTask task) {
Preconditions.checkArgument(!task.isDataTask(), "Invalid task type");
return inputFiles.get(task.file().path().toString());
}
Source : TestRewriteDataFilesAction.java
with Apache License 2.0
from apache
@Test
public void testRewriteLargeTableHasResiduals() throws IOException {
// all records belong to the same partition
List<String> records1 = Lists.newArrayList();
List<String> records2 = Lists.newArrayList();
List<Record> expected = Lists.newArrayList();
for (int i = 0; i < 100; i++) {
int id = i;
String data = String.valueOf(i % 3);
if (i % 2 == 0) {
records1.add("(" + id + ",'" + data + "')");
} else {
records2.add("(" + id + ",'" + data + "')");
}
Record record = RECORD.copy();
record.setField("id", id);
record.setField("data", data);
expected.add(record);
}
sql("INSERT INTO %s values " + StringUtils.join(records1, ","), TABLE_NAME_UNPARreplacedIONED);
sql("INSERT INTO %s values " + StringUtils.join(records2, ","), TABLE_NAME_UNPARreplacedIONED);
icebergTableUnParreplacedioned.refresh();
CloseableIterable<FileScanTask> tasks = icebergTableUnParreplacedioned.newScan().ignoreResiduals().filter(Expressions.equal("data", "0")).planFiles();
for (FileScanTask task : tasks) {
replacedert.replacedertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual());
}
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
replacedert.replacedertEquals("Should have 2 data files before rewrite", 2, dataFiles.size());
Actions actions = Actions.forTable(icebergTableUnParreplacedioned);
RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute();
replacedert.replacedertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
replacedert.replacedertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
// replacedert the table records as expected.
SimpleDataUtil.replacedertTableRecords(icebergTableUnParreplacedioned, expected);
}
Source : RowDataIterator.java
with Apache License 2.0
from apache
private CloseableIterable<RowData> newIterable(FileScanTask task, Schema schema, Map<Integer, ?> idToConstant) {
CloseableIterable<RowData> iter;
if (task.isDataTask()) {
throw new UnsupportedOperationException("Cannot read data task.");
} else {
switch(task.file().format()) {
case PARQUET:
iter = newParquetIterable(task, schema, idToConstant);
break;
case AVRO:
iter = newAvroIterable(task, schema, idToConstant);
break;
case ORC:
iter = newOrcIterable(task, schema, idToConstant);
break;
default:
throw new UnsupportedOperationException("Cannot read unknown format: " + task.file().format());
}
}
return iter;
}
Source : DataIterator.java
with Apache License 2.0
from apache
InputFile getInputFile(FileScanTask task) {
Preconditions.checkArgument(!task.isDataTask(), "Invalid task type");
return inputFiles.get(task.file().path().toString());
}
Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache
@Test
public void testEqualityDeletePlanningStats() throws IOException {
table.newAppend().appendFile(dataFile).commit();
List<Record> deletes = Lists.newArrayList();
Schema deleteRowSchema = SCHEMA.select("data");
Record delete = GenericRecord.create(deleteRowSchema);
deletes.add(delete.copy("data", "d"));
DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
table.newRowDelta().addDeletes(posDeletes).commit();
List<FileScanTask> tasks;
try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
tasks = Lists.newArrayList(tasksIterable);
}
replacedert.replacedertEquals("Should produce one task", 1, tasks.size());
FileScanTask task = tasks.get(0);
replacedert.replacedertEquals("Should have one delete file, data contains a matching value", 1, task.deletes().size());
}
Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache
@Test
public void testEqualityDeletePlanningStatsSomeNullValuesWithSomeNullDeletes() throws IOException {
// note that there are some nulls in the data column
table.newAppend().appendFile(dataFile).commit();
List<Record> deletes = Lists.newArrayList();
Schema deleteRowSchema = SCHEMA.select("data");
Record delete = GenericRecord.create(deleteRowSchema);
// the data and delete ranges do not overlap, but both contain null
deletes.add(delete.copy("data", null));
deletes.add(delete.copy("data", "x"));
DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
table.newRowDelta().addDeletes(posDeletes).commit();
List<FileScanTask> tasks;
try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
tasks = Lists.newArrayList(tasksIterable);
}
replacedert.replacedertEquals("Should produce one task", 1, tasks.size());
FileScanTask task = tasks.get(0);
replacedert.replacedertEquals("Should have one delete file, data and deletes have null values", 1, task.deletes().size());
}
Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache
@Test
public void testEqualityDeletePlanningStatsAllNullValuesWithNoNullDeletes() throws IOException {
// note that there are only nulls in the data column
table.newAppend().appendFile(dataFileOnlyNulls).commit();
List<Record> deletes = Lists.newArrayList();
Schema deleteRowSchema = SCHEMA.select("data");
Record delete = GenericRecord.create(deleteRowSchema);
deletes.add(delete.copy("data", "d"));
DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
table.newRowDelta().addDeletes(posDeletes).commit();
List<FileScanTask> tasks;
try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
tasks = Lists.newArrayList(tasksIterable);
}
replacedert.replacedertEquals("Should produce one task", 1, tasks.size());
FileScanTask task = tasks.get(0);
replacedert.replacedertEquals("Should have no delete files, data contains no null values", 0, task.deletes().size());
}
Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache
@Test
public void testEqualityDeletePlanningStatsNullValueWithAllNullDeletes() throws IOException {
table.newAppend().appendFile(dataFile).commit();
List<Record> deletes = Lists.newArrayList();
Schema deleteRowSchema = SCHEMA.select("data");
Record delete = GenericRecord.create(deleteRowSchema);
deletes.add(delete.copy("data", null));
DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
table.newRowDelta().addDeletes(posDeletes).commit();
List<FileScanTask> tasks;
try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
tasks = Lists.newArrayList(tasksIterable);
}
replacedert.replacedertEquals("Should produce one task", 1, tasks.size());
FileScanTask task = tasks.get(0);
replacedert.replacedertEquals("Should have delete file, data contains a null value", 1, task.deletes().size());
}
Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache
@Test
public void testEqualityDeletePlanningStatsNoNullValuesWithAllNullDeletes() throws IOException {
// note that there are no nulls in the data column
table.newAppend().appendFile(dataFileWithoutNulls).commit();
List<Record> deletes = Lists.newArrayList();
Schema deleteRowSchema = SCHEMA.select("data");
Record delete = GenericRecord.create(deleteRowSchema);
deletes.add(delete.copy("data", null));
DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
table.newRowDelta().addDeletes(posDeletes).commit();
List<FileScanTask> tasks;
try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
tasks = Lists.newArrayList(tasksIterable);
}
replacedert.replacedertEquals("Should produce one task", 1, tasks.size());
FileScanTask task = tasks.get(0);
replacedert.replacedertEquals("Should have no delete files, data contains no null values", 0, task.deletes().size());
}
Source : EqualityDeleteRowReader.java
with Apache License 2.0
from apache
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
SparkDeleteFilter matches = new SparkDeleteFilter(task, tableSchema(), expectedSchema);
// schema or rows returned by readers
Schema requiredSchema = matches.requiredSchema();
Map<Integer, ?> idToConstant = PartitionUtil.constantsMap(task, RowDataReader::convertConstant);
DataFile file = task.file();
// update the current file for Spark's filename() function
InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());
return matches.findEqualityDeleteRows(open(task, requiredSchema, idToConstant)).iterator();
}
Source : BaseDataReader.java
with Apache License 2.0
from apache
/**
* Base class of Spark readers.
*
* @param <T> is the Java class returned by this reader whose objects contain one or more rows.
*/
abstract class BaseDataReader<T> implements Closeable {
private static final Logger LOG = LoggerFactory.getLogger(BaseDataReader.class);
private final Iterator<FileScanTask> tasks;
private final Map<String, InputFile> inputFiles;
private CloseableIterator<T> currentIterator;
private T current = null;
private FileScanTask currentTask = null;
BaseDataReader(CombinedScanTask task, FileIO io, EncryptionManager encryptionManager) {
this.tasks = task.files().iterator();
Map<String, ByteBuffer> keyMetadata = Maps.newHashMap();
task.files().stream().flatMap(fileScanTask -> Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())).forEach(file -> keyMetadata.put(file.path().toString(), file.keyMetadata()));
Stream<EncryptedInputFile> encrypted = keyMetadata.entrySet().stream().map(entry -> EncryptedFiles.encryptedInput(io.newInputFile(entry.getKey()), entry.getValue()));
// decrypt with the batch call to avoid multiple RPCs to a key server, if possible
Iterable<InputFile> decryptedFiles = encryptionManager.decrypt(encrypted::iterator);
Map<String, InputFile> files = Maps.newHashMapWithExpectedSize(task.files().size());
decryptedFiles.forEach(decrypted -> files.putIfAbsent(decrypted.location(), decrypted));
this.inputFiles = Collections.unmodifiableMap(files);
this.currentIterator = CloseableIterator.empty();
}
public boolean next() throws IOException {
try {
while (true) {
if (currentIterator.hasNext()) {
this.current = currentIterator.next();
return true;
} else if (tasks.hasNext()) {
this.currentIterator.close();
this.currentTask = tasks.next();
this.currentIterator = open(currentTask);
} else {
this.currentIterator.close();
return false;
}
}
} catch (IOException | RuntimeException e) {
if (currentTask != null && !currentTask.isDataTask()) {
LOG.error("Error reading file: {}", getInputFile(currentTask).location(), e);
}
throw e;
}
}
public T get() {
return current;
}
abstract CloseableIterator<T> open(FileScanTask task);
@Override
public void close() throws IOException {
InputFileBlockHolder.unset();
// close the current iterator
this.currentIterator.close();
// exhaust the task iterator
while (tasks.hasNext()) {
tasks.next();
}
}
protected InputFile getInputFile(FileScanTask task) {
Preconditions.checkArgument(!task.isDataTask(), "Invalid task type");
return inputFiles.get(task.file().path().toString());
}
protected InputFile getInputFile(String location) {
return inputFiles.get(location);
}
protected static Object convertConstant(Type type, Object value) {
if (value == null) {
return null;
}
switch(type.typeId()) {
case DECIMAL:
return Decimal.apply((BigDecimal) value);
case STRING:
if (value instanceof Utf8) {
Utf8 utf8 = (Utf8) value;
return UTF8String.fromBytes(utf8.getBytes(), 0, utf8.getByteLength());
}
return UTF8String.fromString(value.toString());
case FIXED:
if (value instanceof byte[]) {
return value;
} else if (value instanceof GenericData.Fixed) {
return ((GenericData.Fixed) value).bytes();
}
return ByteBuffers.toByteArray((ByteBuffer) value);
case BINARY:
return ByteBuffers.toByteArray((ByteBuffer) value);
default:
}
return value;
}
}
Source : RowDataIterator.java
with Apache License 2.0
from apache
private CloseableIterable<RowData> newAvroIterable(FileScanTask task, Schema schema, Map<Integer, ?> idToConstant) {
Avro.ReadBuilder builder = Avro.read(getInputFile(task)).reuseContainers().project(schema).split(task.start(), task.length()).createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant));
if (nameMapping != null) {
builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
}
return builder.build();
}
Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache
@Test
public void testEqualityDeletePlanningStatsFilter() throws IOException {
table.newAppend().appendFile(dataFile).commit();
List<Record> deletes = Lists.newArrayList();
Schema deleteRowSchema = table.schema().select("data");
Record delete = GenericRecord.create(deleteRowSchema);
deletes.add(delete.copy("data", "x"));
deletes.add(delete.copy("data", "y"));
deletes.add(delete.copy("data", "z"));
DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
table.newRowDelta().addDeletes(posDeletes).commit();
List<FileScanTask> tasks;
try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
tasks = Lists.newArrayList(tasksIterable);
}
replacedert.replacedertEquals("Should produce one task", 1, tasks.size());
FileScanTask task = tasks.get(0);
replacedert.replacedertEquals("Should not have delete file, filtered by data column stats", 0, task.deletes().size());
}
Source : IcebergUtil.java
with Apache License 2.0
from trinodb
public static Map<Integer, String> getPartitionKeys(FileScanTask scanTask) {
StructLike partition = scanTask.file().partition();
PartitionSpec spec = scanTask.spec();
Map<PartitionField, Integer> fieldToIndex = getIdentityPartitions(spec);
Map<Integer, String> partitionKeys = new HashMap<>();
fieldToIndex.forEach((field, index) -> {
int id = field.sourceId();
org.apache.iceberg.types.Type type = spec.schema().findType(id);
Class<?> javaClass = type.typeId().javaClass();
Object value = partition.get(index, javaClass);
if (value == null) {
partitionKeys.put(id, null);
} else {
String partitionValue;
if (type.typeId() == FIXED || type.typeId() == BINARY) {
// this is safe because Iceberg PartitionData directly wraps the byte array
partitionValue = new String(((ByteBuffer) value).array(), UTF_8);
} else {
partitionValue = value.toString();
}
partitionKeys.put(id, partitionValue);
}
});
return Collections.unmodifiableMap(partitionKeys);
}
Source : SparkBatchScan.java
with Apache License 2.0
from apache
@Override
public Statistics estimateStatistics() {
// it's a fresh table, no data
if (table.currentSnapshot() == null) {
return new Stats(0L, 0L);
}
// estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned)
if (!table.spec().isUnpartitioned() && filterExpressions.isEmpty()) {
LOG.debug("using table metadata to estimate table statistics");
long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(), SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
Schema projectedSchema = expectedSchema != null ? expectedSchema : table.schema();
return new Stats(SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(projectedSchema), totalRecords), totalRecords);
}
long sizeInBytes = 0L;
long numRows = 0L;
for (CombinedScanTask task : tasks()) {
for (FileScanTask file : task.files()) {
sizeInBytes += file.length();
numRows += file.file().recordCount();
}
}
return new Stats(sizeInBytes, numRows);
}
Source : TestWriteMetricsConfig.java
with Apache License 2.0
from apache
@Test
public void testCountMetricsCollectionForParquet() throws IOException {
String tableLocation = temp.newFolder("iceberg-table").toString();
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.unpartitioned();
Map<String, String> properties = Maps.newHashMap();
properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);
List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
df.select("id", "data").coalesce(1).write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, "parquet").mode(SaveMode.Append).save(tableLocation);
for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
DataFile file = task.file();
Assert.assertEquals(2, file.nullValueCounts().size());
Assert.assertEquals(2, file.valueCounts().size());
Assert.assertTrue(file.lowerBounds().isEmpty());
Assert.assertTrue(file.upperBounds().isEmpty());
}
}
Source : TestWriteMetricsConfig.java
with Apache License 2.0
from apache
@Test
public void testFullMetricsCollectionForParquet() throws IOException {
String tableLocation = temp.newFolder("iceberg-table").toString();
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.unpartitioned();
Map<String, String> properties = Maps.newHashMap();
properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full");
Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);
List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
df.select("id", "data").coalesce(1).write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, "parquet").mode(SaveMode.Append).save(tableLocation);
for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
DataFile file = task.file();
Assert.assertEquals(2, file.nullValueCounts().size());
Assert.assertEquals(2, file.valueCounts().size());
Assert.assertEquals(2, file.lowerBounds().size());
Assert.assertEquals(2, file.upperBounds().size());
}
}
Source : TestWriteMetricsConfig.java
with Apache License 2.0
from apache
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
String tableLocation = temp.newFolder("iceberg-table").toString();
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.unpartitioned();
Map<String, String> properties = Maps.newHashMap();
properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);
List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
df.select("id", "data").coalesce(1).write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, "parquet").mode(SaveMode.Append).save(tableLocation);
for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
DataFile file = task.file();
Assert.assertTrue(file.nullValueCounts().isEmpty());
Assert.assertTrue(file.valueCounts().isEmpty());
Assert.assertTrue(file.lowerBounds().isEmpty());
Assert.assertTrue(file.upperBounds().isEmpty());
}
}
Source : TestRewriteDataFilesAction.java
with Apache License 2.0
from apache
@Test
public void testRewriteLargeTableHasResiduals() {
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).build();
Map<String, String> options = Maps.newHashMap();
options.put(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100");
Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
// all records belong to the same partition
List<ThreeColumnRecord> records = Lists.newArrayList();
for (int i = 0; i < 100; i++) {
records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i % 4)));
}
Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
writeDF(df);
table.refresh();
CloseableIterable<FileScanTask> tasks = table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles();
for (FileScanTask task : tasks) {
replacedert.replacedertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual());
}
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
replacedert.replacedertEquals("Should have 2 data files before rewrite", 2, dataFiles.size());
Actions actions = Actions.forTable(table);
RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("c3", "0")).execute();
replacedert.replacedertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
replacedert.replacedertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
table.refresh();
Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
List<ThreeColumnRecord> actualRecords = resultDF.sort("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
Assert.assertEquals("Rows must match", records, actualRecords);
}
Source : RowDataReader.java
with Apache License 2.0
from apache
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
SparkDeleteFilter deletes = new SparkDeleteFilter(task, tableSchema, expectedSchema);
// schema or rows returned by readers
Schema requiredSchema = deletes.requiredSchema();
Map<Integer, ?> idToConstant = PartitionUtil.constantsMap(task, RowDataReader::convertConstant);
DataFile file = task.file();
// update the current file for Spark's filename() function
InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());
return deletes.filter(open(task, requiredSchema, idToConstant)).iterator();
}
Source : RowDataIterator.java
with Apache License 2.0
from apache
private CloseableIterable<RowData> newParquetIterable(FileScanTask task, Schema schema, Map<Integer, ?> idToConstant) {
Parquet.ReadBuilder builder = Parquet.read(getInputFile(task)).reuseContainers().split(task.start(), task.length()).project(schema).createReaderFunc(fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)).filter(task.residual()).caseSensitive(caseSensitive).reuseContainers();
if (nameMapping != null) {
builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
}
return builder.build();
}
Source : Util.java
with Apache License 2.0
from apache
public static String[] blockLocations(FileIO io, CombinedScanTask task) {
Set<String> locations = Sets.newHashSet();
for (FileScanTask f : task.files()) {
InputFile in = io.newInputFile(f.file().path().toString());
if (in instanceof HadoopInputFile) {
Collections.addAll(locations, ((HadoopInputFile) in).getBlockLocations(f.start(), f.length()));
}
}
return locations.toArray(HadoopInputFile.NO_LOCATION_PREFERENCE);
}
Source : IcebergSplitSource.java
with Apache License 2.0
from trinodb
@Override
public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize) {
// TODO: move this to a background thread
List<ConnectorSplit> splits = new ArrayList<>();
Iterator<FileScanTask> iterator = limit(fileScanIterator, maxSize);
while (iterator.hasNext()) {
FileScanTask task = iterator.next();
splits.add(toIcebergSplit(task));
}
return completedFuture(new ConnectorSplitBatch(splits, isFinished()));
}
Source : RowDataReader.java
with Apache License 2.0
from apache
private CloseableIterable<InternalRow> newOrcIterable(InputFile location, FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds()));
ORC.ReadBuilder builder = ORC.read(location).project(readSchemaWithoutConstantAndMetadataFields).split(task.start(), task.length()).createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)).filter(task.residual()).caseSensitive(caseSensitive);
if (nameMapping != null) {
builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
}
return builder.build();
}
Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache
@Test
public void testPositionDeletePlanningPathFilter() throws IOException {
table.newAppend().appendFile(dataFile).commit();
List<Pair<CharSequence, Long>> deletes = Lists.newArrayList();
deletes.add(Pair.of("some-other-file.parquet", 0L));
deletes.add(Pair.of("some-other-file.parquet", 1L));
Pair<DeleteFile, Set<CharSequence>> posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes);
table.newRowDelta().addDeletes(posDeletes.first()).validateDataFilesExist(posDeletes.second()).commit();
List<FileScanTask> tasks;
try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
tasks = Lists.newArrayList(tasksIterable);
}
replacedert.replacedertEquals("Should produce one task", 1, tasks.size());
FileScanTask task = tasks.get(0);
replacedert.replacedertEquals("Should not have delete file, filtered by file_path stats", 0, task.deletes().size());
}
Source : TableStatisticsMaker.java
with Apache License 2.0
from trinodb
private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
if (tableHandle.getSnapshotId().isEmpty() || constraint.getSummary().isNone()) {
return TableStatistics.empty();
}
TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary().transform(IcebergColumnHandle.class::cast).intersect(tableHandle.getEnforcedPredicate());
if (intersection.isNone()) {
return TableStatistics.empty();
}
List<Types.NestedField> columns = icebergTable.schema().columns();
Map<Integer, Type.PrimitiveType> idToTypeMapping = columns.stream().filter(column -> column.type().isPrimitiveType()).collect(Collectors.toMap(Types.NestedField::fieldId, column -> column.type().asPrimitiveType()));
List<PartitionField> partitionFields = icebergTable.spec().fields();
Set<Integer> identityPartitionIds = getIdentityPartitions(icebergTable.spec()).keySet().stream().map(PartitionField::sourceId).collect(toSet());
List<Types.NestedField> nonPartitionPrimitiveColumns = columns.stream().filter(column -> !identityPartitionIds.contains(column.fieldId()) && column.type().isPrimitiveType()).collect(toImmutableList());
List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
List<IcebergColumnHandle> columnHandles = getColumns(icebergTable.schema(), typeManager);
Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream().collect(toUnmodifiableMap(IcebergColumnHandle::getId, identity()));
ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
for (int index = 0; index < partitionFields.size(); index++) {
PartitionField field = partitionFields.get(index);
Type type = icebergPartitionTypes.get(index);
idToDetailsBuilder.put(field.sourceId(), new ColumnFieldDetails(field, idToColumnHandle.get(field.sourceId()), type, toTrinoType(type, typeManager), type.typeId().javaClass()));
}
Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.build();
TableScan tableScan = icebergTable.newScan().filter(toIcebergExpression(intersection)).useSnapshot(tableHandle.getSnapshotId().get()).includeColumnStats();
Partition summary = null;
try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
for (FileScanTask fileScanTask : fileScanTasks) {
DataFile dataFile = fileScanTask.file();
if (!dataFileMatches(dataFile, constraint, idToTypeMapping, partitionFields, idToDetails)) {
continue;
}
if (summary == null) {
summary = new Partition(idToTypeMapping, nonPartitionPrimitiveColumns, dataFile.partition(), dataFile.recordCount(), dataFile.fileSizeInBytes(), toMap(idToTypeMapping, dataFile.lowerBounds()), toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.columnSizes());
} else {
summary.incrementFileCount();
summary.incrementRecordCount(dataFile.recordCount());
summary.incrementSize(dataFile.fileSizeInBytes());
updateSummaryMin(summary, partitionFields, toMap(idToTypeMapping, dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
updateSummaryMax(summary, partitionFields, toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
summary.updateNullCount(dataFile.nullValueCounts());
updateColumnSizes(summary, dataFile.columnSizes());
}
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
if (summary == null) {
return TableStatistics.empty();
}
ImmutableMap.Builder<ColumnHandle, ColumnStatistics> columnHandleBuilder = ImmutableMap.builder();
double recordCount = summary.getRecordCount();
for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
int fieldId = columnHandle.getId();
ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
Long nullCount = summary.getNullCounts().get(fieldId);
if (nullCount != null) {
columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
}
if (summary.getColumnSizes() != null) {
Long columnSize = summary.getColumnSizes().get(fieldId);
if (columnSize != null) {
columnBuilder.setDataSize(Estimate.of(columnSize));
}
}
Object min = summary.getMinValues().get(fieldId);
Object max = summary.getMaxValues().get(fieldId);
if (min instanceof Number && max instanceof Number) {
columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue())));
}
columnHandleBuilder.put(columnHandle, columnBuilder.build());
}
return new TableStatistics(Estimate.of(recordCount), columnHandleBuilder.build());
}
Source : TestIcebergPartitionData.java
with Apache License 2.0
from dremio
private void verifyPartitionValue(PartitionSpec partitionSpec, IcebergPartitionData partitionData, String columnName, Class expectedClass, Object expectedValue) throws Exception {
String tableName = "icebergPartitionTest";
File tableFolder = new File(folder.getRoot(), tableName);
try {
tableFolder.mkdir();
File dataFile = new File(folder.getRoot(), "a.parquet");
dataFile.createNewFile();
DataFile d1 = DataFiles.builder(partitionSpec).withInputFile(Files.localInput(dataFile)).withRecordCount(50).withFormat(FileFormat.PARQUET).withPartition(partitionData).build();
IcebergOpCommitter committer = IcebergOperation.getCreateTableCommitter(tableName, Path.of(tableFolder.toPath().toString()), SchemaConverter.fromIceberg(schema), Lists.newArrayList(columnName), new Configuration());
committer.consumeData(Lists.newArrayList(d1));
committer.commit();
Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
for (FileScanTask fileScanTask : table.newScan().planFiles()) {
StructLike structLike = fileScanTask.file().partition();
if (expectedClass == ByteBuffer.class) {
Assert.assertEquals(structLike.get(0, expectedClass).hashCode(), ByteBuffer.wrap((byte[]) expectedValue).hashCode());
} else {
Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
}
}
} finally {
tableFolder.delete();
}
}
Source : TestWriteMetricsConfig.java
with Apache License 2.0
from apache
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
String tableLocation = temp.newFolder("iceberg-table").toString();
HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.unpartitioned();
Map<String, String> properties = Maps.newHashMap();
properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
properties.put("write.metadata.metrics.column.id", "full");
Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);
List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
df.select("id", "data").coalesce(1).write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, "parquet").mode(SaveMode.Append).save(tableLocation);
Schema schema = table.schema();
Types.NestedField id = schema.findField("id");
for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
DataFile file = task.file();
Assert.assertEquals(2, file.nullValueCounts().size());
Assert.assertEquals(2, file.valueCounts().size());
Assert.assertEquals(1, file.lowerBounds().size());
Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
Assert.assertEquals(1, file.upperBounds().size());
Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
}
}
Source : RowDataIterator.java
with Apache License 2.0
from apache
private CloseableIterable<RowData> newOrcIterable(FileScanTask task, Schema schema, Map<Integer, ?> idToConstant) {
Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds()));
ORC.ReadBuilder builder = ORC.read(getInputFile(task)).project(readSchemaWithoutConstantAndMetadataFields).split(task.start(), task.length()).createReaderFunc(readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)).filter(task.residual()).caseSensitive(caseSensitive);
if (nameMapping != null) {
builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
}
return builder.build();
}
Source : RowDataIterator.java
with Apache License 2.0
from apache
@Override
protected CloseableIterator<RowData> openTaskIterator(FileScanTask task) {
Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds());
Map<Integer, ?> idToConstant = partitionSchema.columns().isEmpty() ? ImmutableMap.of() : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant);
FlinkDeleteFilter deletes = new FlinkDeleteFilter(task, tableSchema, projectedSchema);
CloseableIterable<RowData> iterable = deletes.filter(newIterable(task, deletes.requiredSchema(), idToConstant));
return iterable.iterator();
}
Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache
@Test
public void testPositionDeletePlanningPath() throws IOException {
table.newAppend().appendFile(dataFile).commit();
List<Pair<CharSequence, Long>> deletes = Lists.newArrayList();
deletes.add(Pair.of(dataFile.path(), 0L));
deletes.add(Pair.of(dataFile.path(), 1L));
Pair<DeleteFile, Set<CharSequence>> posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes);
table.newRowDelta().addDeletes(posDeletes.first()).validateDataFilesExist(posDeletes.second()).commit();
List<FileScanTask> tasks;
try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
tasks = Lists.newArrayList(tasksIterable);
}
replacedert.replacedertEquals("Should produce one task", 1, tasks.size());
FileScanTask task = tasks.get(0);
replacedert.replacedertEquals("Should have one delete file, file_path matches", 1, task.deletes().size());
}
Source : GenericReader.java
with Apache License 2.0
from apache
private CloseableIterable<Record> openFile(FileScanTask task, Schema fileProjection) {
InputFile input = io.newInputFile(task.file().path().toString());
Map<Integer, ?> partition = PartitionUtil.constantsMap(task, IdentityPartitionConverters::convertConstant);
switch(task.file().format()) {
case AVRO:
Avro.ReadBuilder avro = Avro.read(input).project(fileProjection).createReaderFunc(avroSchema -> DataReader.create(fileProjection, avroSchema, partition)).split(task.start(), task.length());
if (reuseContainers) {
avro.reuseContainers();
}
return avro.build();
case PARQUET:
Parquet.ReadBuilder parquet = Parquet.read(input).project(fileProjection).createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(fileProjection, fileSchema, partition)).split(task.start(), task.length()).filter(task.residual());
if (reuseContainers) {
parquet.reuseContainers();
}
return parquet.build();
case ORC:
Schema projectionWithoutConstantAndMetadataFields = TypeUtil.selectNot(fileProjection, Sets.union(partition.keySet(), MetadataColumns.metadataFieldIds()));
ORC.ReadBuilder orc = ORC.read(input).project(projectionWithoutConstantAndMetadataFields).createReaderFunc(fileSchema -> GenericOrcReader.buildReader(fileProjection, fileSchema, partition)).split(task.start(), task.length()).filter(task.residual());
return orc.build();
default:
throw new UnsupportedOperationException(String.format("Cannot read %s file: %s", task.file().format().name(), task.file().path()));
}
}
Source : TestTableSerialization.java
with Apache License 2.0
from apache
private static Set<CharSequence> getFiles(Table table) throws IOException {
Set<CharSequence> files = Sets.newHashSet();
try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
for (FileScanTask task : tasks) {
files.add(task.file().path());
}
}
return files;
}
Source : PartitionTable.java
with Apache License 2.0
from trinodb
private Map<StructLikeWrapper, Partition> getPartitions(TableScan tableScan) {
try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
Map<StructLikeWrapper, Partition> partitions = new HashMap<>();
for (FileScanTask fileScanTask : fileScanTasks) {
DataFile dataFile = fileScanTask.file();
Types.StructType structType = fileScanTask.spec().partitionType();
StructLike partitionStruct = dataFile.partition();
StructLikeWrapper partitionWrapper = StructLikeWrapper.forType(structType).set(partitionStruct);
if (!partitions.containsKey(partitionWrapper)) {
Partition partition = new Partition(idToTypeMapping, nonPartitionPrimitiveColumns, partitionStruct, dataFile.recordCount(), dataFile.fileSizeInBytes(), toMap(dataFile.lowerBounds()), toMap(dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.columnSizes());
partitions.put(partitionWrapper, partition);
continue;
}
Partition partition = partitions.get(partitionWrapper);
partition.incrementFileCount();
partition.incrementRecordCount(dataFile.recordCount());
partition.incrementSize(dataFile.fileSizeInBytes());
partition.updateMin(toMap(dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
partition.updateMax(toMap(dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
partition.updateNullCount(dataFile.nullValueCounts());
}
return partitions;
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}