org.apache.iceberg.FileScanTask

Here are examples of the Java API org.apache.iceberg.FileScanTask, taken from open-source projects. By voting up you can indicate which examples are most useful and appropriate.
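
Most of the examples below start from the same planning step: a table scan is created and planFiles() yields one FileScanTask per data file slice. As a minimal, hedged sketch (the Table instance is assumed to be already loaded, for example via HadoopTables as in several examples on this page):

import java.io.IOException;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

static void printScanTasks(Table table) throws IOException {
    // plan the data files of the current snapshot and inspect each task
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
        for (FileScanTask task : tasks) {
            System.out.printf("%s start=%d length=%d residual=%s%n",
                    task.file().path(), task.start(), task.length(), task.residual());
        }
    }
}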

54 Examples

19 Source : IcebergSplitSource.java
with Apache License 2.0
from trinodb

private ConnectorSplit toIcebergSplit(FileScanTask task) {
    // TODO: We should leverage residual expression and convert that to TupleDomain.
    // The predicate here is used by readers for predicate push down at reader level,
    // so when we do not use residual expression, we are just wasting CPU cycles
    // on reader side evaluating a condition that we know will always be true.
    return new IcebergSplit(task.file().path().toString(), task.start(), task.length(), task.file().fileSizeInBytes(), task.file().format(), ImmutableList.of(), getPartitionKeys(task));
}

19 Source : TestSparkBaseDataReader.java
with Apache License 2.0
from apache

@Test
public void testClosureDuringIteration() throws IOException {
    Integer totalTasks = 2;
    Integer recordPerTask = 1;
    List<FileScanTask> tasks = createFileScanTasks(totalTasks, recordPerTask);
    Assert.assertEquals(2, tasks.size());
    FileScanTask firstTask = tasks.get(0);
    FileScanTask secondTask = tasks.get(1);
    ClosureTrackingReader reader = new ClosureTrackingReader(tasks);
    // Total of 2 elements
    Assert.assertTrue(reader.next());
    Assert.assertFalse("First iter should not be closed on its last element", reader.isIteratorClosed(firstTask));
    Assert.assertTrue(reader.next());
    Assert.assertTrue("First iter should be closed after moving to second iter", reader.isIteratorClosed(firstTask));
    Assert.assertFalse("Second iter should not be closed on its last element", reader.isIteratorClosed(secondTask));
    Assert.assertFalse(reader.next());
    Assert.assertTrue(reader.isIteratorClosed(firstTask));
    Assert.assertTrue(reader.isIteratorClosed(secondTask));
}

19 Source : RowDataReader.java
with Apache License 2.0
from apache

protected CloseableIterable<InternalRow> open(FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
    CloseableIterable<InternalRow> iter;
    if (task.isDataTask()) {
        iter = newDataIterable(task.asDataTask(), readSchema);
    } else {
        InputFile location = getInputFile(task);
        Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
        switch(task.file().format()) {
            case PARQUET:
                iter = newParquetIterable(location, task, readSchema, idToConstant);
                break;
            case AVRO:
                iter = newAvroIterable(location, task, readSchema, idToConstant);
                break;
            case ORC:
                iter = newOrcIterable(location, task, readSchema, idToConstant);
                break;
            default:
                throw new UnsupportedOperationException("Cannot read unknown format: " + task.file().format());
        }
    }
    return iter;
}

19 Source : GenericReader.java
with Apache License 2.0
from apache

public CloseableIterable<Record> open(FileScanTask task) {
    DeleteFilter<Record> deletes = new GenericDeleteFilter(io, task, tableSchema, projection);
    Schema readSchema = deletes.requiredSchema();
    CloseableIterable<Record> records = openFile(task, readSchema);
    records = deletes.filter(records);
    records = applyResidual(records, readSchema, task.residual());
    return records;
}

19 Source : TableScanUtil.java
with Apache License 2.0
from apache

public static boolean hasDeletes(FileScanTask task) {
    return !task.deletes().isEmpty();
}
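
For illustration, a hedged sketch that combines this helper with the planning loop to count tasks carrying row-level delete files; the Table instance is assumed, and the import path assumes TableScanUtil lives in Iceberg's util package:

import java.io.IOException;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.util.TableScanUtil;

static long countTasksWithDeletes(Table table) throws IOException {
    long withDeletes = 0;
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
        for (FileScanTask task : tasks) {
            // equivalent to checking !task.deletes().isEmpty() directly
            if (TableScanUtil.hasDeletes(task)) {
                withDeletes++;
            }
        }
    }
    return withDeletes;
}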

19 Source : PartitionUtil.java
with Apache License 2.0
from apache

public static Map<Integer, ?> constantsMap(FileScanTask task) {
    return constantsMap(task, (type, constant) -> constant);
}

18 Source : BaseRewriteDataFilesAction.java
with Apache License 2.0
from apache

private boolean isPartialFileScan(CombinedScanTask task) {
    if (task.files().size() == 1) {
        FileScanTask fileScanTask = task.files().iterator().next();
        return fileScanTask.file().fileSizeInBytes() != fileScanTask.length();
    } else {
        return false;
    }
}

17 Source : TestIcebergCTASWithPartition.java
with Apache License 2.0
from dremio

private void verifyPartitionValue(String tableFolder, Class expectedClass, Object expectedValue) {
    Table table = new HadoopTables(new Configuration()).load(tableFolder);
    for (FileScanTask fileScanTask : table.newScan().planFiles()) {
        StructLike structLike = fileScanTask.file().partition();
        Assert.assertEquals(structLike.get(0, expectedClass), expectedValue);
    }
}

17 Source : Reader.java
with Apache License 2.0
from apache

@Override
public Statistics estimateStatistics() {
    // it's a fresh table, no data
    if (table.currentSnapshot() == null) {
        return new Stats(0L, 0L);
    }
    // estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned)
    if (!table.spec().isUnpartitioned() && filterExpression() == Expressions.alwaysTrue()) {
        long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(), SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
        return new Stats(SparkSchemaUtil.estimateSize(lazyType(), totalRecords), totalRecords);
    }
    long sizeInBytes = 0L;
    long numRows = 0L;
    for (CombinedScanTask task : tasks()) {
        for (FileScanTask file : task.files()) {
            sizeInBytes += file.length();
            numRows += file.file().recordCount();
        }
    }
    return new Stats(sizeInBytes, numRows);
}

17 Source : RowDataReader.java
with Apache License 2.0
from apache

private CloseableIterable<InternalRow> newAvroIterable(InputFile location, FileScanTask task, Schema projection, Map<Integer, ?> idToConstant) {
    Avro.ReadBuilder builder = Avro.read(location).reuseContainers().project(projection).split(task.start(), task.length()).createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));
    if (nameMapping != null) {
        builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }
    return builder.build();
}

17 Source : RowDataReader.java
with Apache License 2.0
from apache

private CloseableIterable<InternalRow> newParquetIterable(InputFile location, FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
    Parquet.ReadBuilder builder = Parquet.read(location).reuseContainers().split(task.start(), task.length()).project(readSchema).createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)).filter(task.residual()).caseSensitive(caseSensitive);
    if (nameMapping != null) {
        builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }
    return builder.build();
}

16 Source : IcebergReaderFactory.java
with Apache License 2.0
from ExpediaGroup

private CloseableIterable buildParquetReader(FileScanTask task, InputFile file, Schema schema, boolean reuseContainers) {
    Parquet.ReadBuilder builder = Parquet.read(file).createReaderFunc(messageType -> GenericParquetReaders.buildReader(schema, messageType)).project(schema).filter(task.residual()).split(task.start(), task.length());
    if (reuseContainers) {
        builder.reuseContainers();
    }
    return builder.build();
}

16 Source : IcebergReaderFactory.java
with Apache License 2.0
from ExpediaGroup

private CloseableIterable buildAvroReader(FileScanTask task, InputFile file, Schema schema, boolean reuseContainers) {
    Avro.ReadBuilder builder = Avro.read(file).createReaderFunc(DataReader::create).project(schema).split(task.start(), task.length());
    if (reuseContainers) {
        builder.reuseContainers();
    }
    return builder.build();
}

16 Source : IcebergReaderFactory.java
with Apache License 2.0
from ExpediaGroup

private CloseableIterable buildOrcReader(FileScanTask task, InputFile file, Schema schema, boolean reuseContainers) {
    ORC.ReadBuilder builder = ORC.read(file).project(schema).filter(task.residual()).split(task.start(), task.length());
    return builder.build();
}

16 Source : IcebergReaderFactory.java
with Apache License 2.0
from ExpediaGroup

public Iterable<Record> createReader(DataFile file, FileScanTask currentTask, InputFile inputFile, Schema tableSchema, boolean reuseContainers, Table table) {
    switch(file.format()) {
        case AVRO:
            return buildAvroReader(currentTask, inputFile, tableSchema, reuseContainers);
        case ORC:
            return buildOrcReader(currentTask, inputFile, tableSchema, reuseContainers);
        case PARQUET:
            return buildParquetReader(currentTask, inputFile, tableSchema, reuseContainers);
        case METADATA:
            return buildMetadataReader(table);
        default:
            throw new UnsupportedOperationException(String.format("Cannot read %s file: %s", file.format().name(), file.path()));
    }
}
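
A hedged sketch of how this factory might be driven from a planned scan; the Table and an already-constructed IcebergReaderFactory are assumed, and each data file is opened through the table's FileIO (the factory's own import is project-specific and omitted):

import java.io.IOException;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.InputFile;

static void printRecords(Table table, IcebergReaderFactory factory) throws IOException {
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
        for (FileScanTask task : tasks) {
            // open the underlying data file through the table's FileIO
            InputFile input = table.io().newInputFile(task.file().path().toString());
            Iterable<Record> records = factory.createReader(task.file(), task, input, table.schema(), true, table);
            records.forEach(System.out::println);
        }
    }
}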

16 Source : IcebergTableWrapper.java
with Apache License 2.0
from dremio

// build the list of "distinct partition values" and the corresponding dataset splits.
// TODO: this should be optimised to handle deltas.
private void buildPartitionsAndSplits() throws IOException {
    PartitionConverter partitionConverter = new PartitionConverter(schema);
    SplitConverter splitConverter = new SplitConverter(context, fs, schema, datasetColumnValueCounts);
    // map of distinct partition values.
    // iterate over all data files to get the partition values and add them to the map.
    // TODO ravindra: this iteration requires reading all of the manifest files. This should go via
    // the dremio wrappers.
    for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
        List<PartitionValue> partition = partitionConverter.from(task);
        DatasetSplit split = splitConverter.from(task);
        partitionChunkListing.put(partition, split);
        recordCount += task.file().recordCount();
    }
}

16 Source : BaseDataReader.java
with Apache License 2.0
from apache

protected InputFile getInputFile(FileScanTask task) {
    Preconditions.checkArgument(!task.isDataTask(), "Invalid task type");
    return inputFiles.get(task.file().path().toString());
}

16 Source : TestRewriteDataFilesAction.java
with Apache License 2.0
from apache

@Test
public void testRewriteLargeTableHasResiduals() throws IOException {
    // all records belong to the same partition
    List<String> records1 = Lists.newArrayList();
    List<String> records2 = Lists.newArrayList();
    List<Record> expected = Lists.newArrayList();
    for (int i = 0; i < 100; i++) {
        int id = i;
        String data = String.valueOf(i % 3);
        if (i % 2 == 0) {
            records1.add("(" + id + ",'" + data + "')");
        } else {
            records2.add("(" + id + ",'" + data + "')");
        }
        Record record = RECORD.copy();
        record.setField("id", id);
        record.setField("data", data);
        expected.add(record);
    }
    sql("INSERT INTO %s values " + StringUtils.join(records1, ","), TABLE_NAME_UNPARTITIONED);
    sql("INSERT INTO %s values " + StringUtils.join(records2, ","), TABLE_NAME_UNPARTITIONED);
    icebergTableUnPartitioned.refresh();
    CloseableIterable<FileScanTask> tasks = icebergTableUnPartitioned.newScan().ignoreResiduals().filter(Expressions.equal("data", "0")).planFiles();
    for (FileScanTask task : tasks) {
        Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual());
    }
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size());
    Actions actions = Actions.forTable(icebergTableUnPartitioned);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute();
    Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    // assert the table records as expected.
    SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected);
}

16 Source : RowDataIterator.java
with Apache License 2.0
from apache

private CloseableIterable<RowData> newIterable(FileScanTask task, Schema schema, Map<Integer, ?> idToConstant) {
    CloseableIterable<RowData> iter;
    if (task.isDataTask()) {
        throw new UnsupportedOperationException("Cannot read data task.");
    } else {
        switch(task.file().format()) {
            case PARQUET:
                iter = newParquetIterable(task, schema, idToConstant);
                break;
            case AVRO:
                iter = newAvroIterable(task, schema, idToConstant);
                break;
            case ORC:
                iter = newOrcIterable(task, schema, idToConstant);
                break;
            default:
                throw new UnsupportedOperationException("Cannot read unknown format: " + task.file().format());
        }
    }
    return iter;
}

16 Source : DataIterator.java
with Apache License 2.0
from apache

InputFile getInputFile(FileScanTask task) {
    Preconditions.checkArgument(!task.isDataTask(), "Invalid task type");
    return inputFiles.get(task.file().path().toString());
}

16 Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache

@Test
public void testEqualityDeletePlanningStats() throws IOException {
    table.newAppend().appendFile(dataFile).commit();
    List<Record> deletes = Lists.newArrayList();
    Schema deleteRowSchema = SCHEMA.select("data");
    Record delete = GenericRecord.create(deleteRowSchema);
    deletes.add(delete.copy("data", "d"));
    DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
    table.newRowDelta().addDeletes(posDeletes).commit();
    List<FileScanTask> tasks;
    try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
        tasks = Lists.newArrayList(tasksIterable);
    }
    Assert.assertEquals("Should produce one task", 1, tasks.size());
    FileScanTask task = tasks.get(0);
    Assert.assertEquals("Should have one delete file, data contains a matching value", 1, task.deletes().size());
}

16 Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache

@Test
public void testEqualityDeletePlanningStatsSomeNullValuesWithSomeNullDeletes() throws IOException {
    // note that there are some nulls in the data column
    table.newAppend().appendFile(dataFile).commit();
    List<Record> deletes = Lists.newArrayList();
    Schema deleteRowSchema = SCHEMA.select("data");
    Record delete = GenericRecord.create(deleteRowSchema);
    // the data and delete ranges do not overlap, but both contain null
    deletes.add(delete.copy("data", null));
    deletes.add(delete.copy("data", "x"));
    DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
    table.newRowDelta().addDeletes(posDeletes).commit();
    List<FileScanTask> tasks;
    try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
        tasks = Lists.newArrayList(tasksIterable);
    }
    Assert.assertEquals("Should produce one task", 1, tasks.size());
    FileScanTask task = tasks.get(0);
    Assert.assertEquals("Should have one delete file, data and deletes have null values", 1, task.deletes().size());
}

16 Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache

@Test
public void testEqualityDeletePlanningStatsAllNullValuesWithNoNullDeletes() throws IOException {
    // note that there are only nulls in the data column
    table.newAppend().appendFile(dataFileOnlyNulls).commit();
    List<Record> deletes = Lists.newArrayList();
    Schema deleteRowSchema = SCHEMA.select("data");
    Record delete = GenericRecord.create(deleteRowSchema);
    deletes.add(delete.copy("data", "d"));
    DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
    table.newRowDelta().addDeletes(posDeletes).commit();
    List<FileScanTask> tasks;
    try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
        tasks = Lists.newArrayList(tasksIterable);
    }
    Assert.assertEquals("Should produce one task", 1, tasks.size());
    FileScanTask task = tasks.get(0);
    Assert.assertEquals("Should have no delete files, data contains no null values", 0, task.deletes().size());
}

16 Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache

@Test
public void testEqualityDeletePlanningStatsNullValueWithAllNullDeletes() throws IOException {
    table.newAppend().appendFile(dataFile).commit();
    List<Record> deletes = Lists.newArrayList();
    Schema deleteRowSchema = SCHEMA.select("data");
    Record delete = GenericRecord.create(deleteRowSchema);
    deletes.add(delete.copy("data", null));
    DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
    table.newRowDelta().addDeletes(posDeletes).commit();
    List<FileScanTask> tasks;
    try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
        tasks = Lists.newArrayList(tasksIterable);
    }
    Assert.assertEquals("Should produce one task", 1, tasks.size());
    FileScanTask task = tasks.get(0);
    Assert.assertEquals("Should have delete file, data contains a null value", 1, task.deletes().size());
}

16 Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache

@Test
public void testEqualityDeletePlanningStatsNoNullValuesWithAllNullDeletes() throws IOException {
    // note that there are no nulls in the data column
    table.newAppend().appendFile(dataFileWithoutNulls).commit();
    List<Record> deletes = Lists.newArrayList();
    Schema deleteRowSchema = SCHEMA.select("data");
    Record delete = GenericRecord.create(deleteRowSchema);
    deletes.add(delete.copy("data", null));
    DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
    table.newRowDelta().addDeletes(posDeletes).commit();
    List<FileScanTask> tasks;
    try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
        tasks = Lists.newArrayList(tasksIterable);
    }
    Assert.assertEquals("Should produce one task", 1, tasks.size());
    FileScanTask task = tasks.get(0);
    Assert.assertEquals("Should have no delete files, data contains no null values", 0, task.deletes().size());
}

15 Source : EqualityDeleteRowReader.java
with Apache License 2.0
from apache

@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
    SparkDeleteFilter matches = new SparkDeleteFilter(task, tableSchema(), expectedSchema);
    // schema or rows returned by readers
    Schema requiredSchema = matches.requiredSchema();
    Map<Integer, ?> idToConstant = PartitionUtil.constantsMap(task, RowDataReader::convertConstant);
    DataFile file = task.file();
    // update the current file for Spark's filename() function
    InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());
    return matches.findEqualityDeleteRows(open(task, requiredSchema, idToConstant)).iterator();
}

15 Source : BaseDataReader.java
with Apache License 2.0
from apache

/**
 * Base class of Spark readers.
 *
 * @param <T> is the Java class returned by this reader whose objects contain one or more rows.
 */
abstract class BaseDataReader<T> implements Closeable {

    private static final Logger LOG = LoggerFactory.getLogger(BaseDataReader.class);

    private final Iterator<FileScanTask> tasks;

    private final Map<String, InputFile> inputFiles;

    private CloseableIterator<T> currentIterator;

    private T current = null;

    private FileScanTask currentTask = null;

    BaseDataReader(CombinedScanTask task, FileIO io, EncryptionManager encryptionManager) {
        this.tasks = task.files().iterator();
        Map<String, ByteBuffer> keyMetadata = Maps.newHashMap();
        task.files().stream().flatMap(fileScanTask -> Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())).forEach(file -> keyMetadata.put(file.path().toString(), file.keyMetadata()));
        Stream<EncryptedInputFile> encrypted = keyMetadata.entrySet().stream().map(entry -> EncryptedFiles.encryptedInput(io.newInputFile(entry.getKey()), entry.getValue()));
        // decrypt with the batch call to avoid multiple RPCs to a key server, if possible
        Iterable<InputFile> decryptedFiles = encryptionManager.decrypt(encrypted::iterator);
        Map<String, InputFile> files = Maps.newHashMapWithExpectedSize(task.files().size());
        decryptedFiles.forEach(decrypted -> files.putIfAbsent(decrypted.location(), decrypted));
        this.inputFiles = Collections.unmodifiableMap(files);
        this.currentIterator = CloseableIterator.empty();
    }

    public boolean next() throws IOException {
        try {
            while (true) {
                if (currentIterator.hasNext()) {
                    this.current = currentIterator.next();
                    return true;
                } else if (tasks.hasNext()) {
                    this.currentIterator.close();
                    this.currentTask = tasks.next();
                    this.currentIterator = open(currentTask);
                } else {
                    this.currentIterator.close();
                    return false;
                }
            }
        } catch (IOException | RuntimeException e) {
            if (currentTask != null && !currentTask.isDataTask()) {
                LOG.error("Error reading file: {}", getInputFile(currentTask).location(), e);
            }
            throw e;
        }
    }

    public T get() {
        return current;
    }

    abstract CloseableIterator<T> open(FileScanTask task);

    @Override
    public void close() throws IOException {
        InputFileBlockHolder.unset();
        // close the current iterator
        this.currentIterator.close();
        // exhaust the task iterator
        while (tasks.hasNext()) {
            tasks.next();
        }
    }

    protected InputFile getInputFile(FileScanTask task) {
        Preconditions.checkArgument(!task.isDataTask(), "Invalid task type");
        return inputFiles.get(task.file().path().toString());
    }

    protected InputFile getInputFile(String location) {
        return inputFiles.get(location);
    }

    protected static Object convertConstant(Type type, Object value) {
        if (value == null) {
            return null;
        }
        switch(type.typeId()) {
            case DECIMAL:
                return Decimal.apply((BigDecimal) value);
            case STRING:
                if (value instanceof Utf8) {
                    Utf8 utf8 = (Utf8) value;
                    return UTF8String.fromBytes(utf8.getBytes(), 0, utf8.getByteLength());
                }
                return UTF8String.fromString(value.toString());
            case FIXED:
                if (value instanceof byte[]) {
                    return value;
                } else if (value instanceof GenericData.Fixed) {
                    return ((GenericData.Fixed) value).bytes();
                }
                return ByteBuffers.toByteArray((ByteBuffer) value);
            case BINARY:
                return ByteBuffers.toByteArray((ByteBuffer) value);
            default:
        }
        return value;
    }
}
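
For context, a hedged sketch of the read loop a caller would follow against a concrete subclass such as the RowDataReader shown elsewhere on this page; newReaderFor is a hypothetical helper (not part of the API), and combinedTask, io, and encryptionManager are assumed to be in scope:

// newReaderFor is a hypothetical helper that builds a concrete BaseDataReader subclass
try (BaseDataReader<InternalRow> reader = newReaderFor(combinedTask, io, encryptionManager)) {
    while (reader.next()) {
        InternalRow row = reader.get();
        // process the row
    }
}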

15 Source : RowDataIterator.java
with Apache License 2.0
from apache

private CloseableIterable<RowData> newAvroIterable(FileScanTask task, Schema schema, Map<Integer, ?> idToConstant) {
    Avro.ReadBuilder builder = Avro.read(getInputFile(task)).reuseContainers().project(schema).split(task.start(), task.length()).createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant));
    if (nameMapping != null) {
        builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }
    return builder.build();
}

15 Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache

@Test
public void testEqualityDeletePlanningStatsFilter() throws IOException {
    table.newAppend().appendFile(dataFile).commit();
    List<Record> deletes = Lists.newArrayList();
    Schema deleteRowSchema = table.schema().select("data");
    Record delete = GenericRecord.create(deleteRowSchema);
    deletes.add(delete.copy("data", "x"));
    deletes.add(delete.copy("data", "y"));
    deletes.add(delete.copy("data", "z"));
    DeleteFile posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema);
    table.newRowDelta().addDeletes(posDeletes).commit();
    List<FileScanTask> tasks;
    try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
        tasks = Lists.newArrayList(tasksIterable);
    }
    Assert.assertEquals("Should produce one task", 1, tasks.size());
    FileScanTask task = tasks.get(0);
    Assert.assertEquals("Should not have delete file, filtered by data column stats", 0, task.deletes().size());
}

14 Source : IcebergUtil.java
with Apache License 2.0
from trinodb

public static Map<Integer, String> getPartitionKeys(FileScanTask scanTask) {
    StructLike partition = scanTask.file().partition();
    PartitionSpec spec = scanTask.spec();
    Map<PartitionField, Integer> fieldToIndex = getIdentityPartitions(spec);
    Map<Integer, String> partitionKeys = new HashMap<>();
    fieldToIndex.forEach((field, index) -> {
        int id = field.sourceId();
        org.apache.iceberg.types.Type type = spec.schema().findType(id);
        Class<?> javaClass = type.typeId().javaClass();
        Object value = partition.get(index, javaClass);
        if (value == null) {
            partitionKeys.put(id, null);
        } else {
            String partitionValue;
            if (type.typeId() == FIXED || type.typeId() == BINARY) {
                // this is safe because Iceberg PartitionData directly wraps the byte array
                partitionValue = new String(((ByteBuffer) value).array(), UTF_8);
            } else {
                partitionValue = value.toString();
            }
            partitionKeys.put(id, partitionValue);
        }
    });
    return Collections.unmodifiableMap(partitionKeys);
}

14 Source : SparkBatchScan.java
with Apache License 2.0
from apache

@Override
public Statistics estimateStatistics() {
    // it's a fresh table, no data
    if (table.currentSnapshot() == null) {
        return new Stats(0L, 0L);
    }
    // estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned)
    if (!table.spec().isUnpartitioned() && filterExpressions.isEmpty()) {
        LOG.debug("using table metadata to estimate table statistics");
        long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(), SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
        Schema projectedSchema = expectedSchema != null ? expectedSchema : table.schema();
        return new Stats(SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(projectedSchema), totalRecords), totalRecords);
    }
    long sizeInBytes = 0L;
    long numRows = 0L;
    for (CombinedScanTask task : tasks()) {
        for (FileScanTask file : task.files()) {
            sizeInBytes += file.length();
            numRows += file.file().recordCount();
        }
    }
    return new Stats(sizeInBytes, numRows);
}

14 Source : TestWriteMetricsConfig.java
with Apache License 2.0
from apache

@Test
public void testCountMetricsCollectionForParquet() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> properties = Maps.newHashMap();
    properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
    Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    df.select("id", "data").coalesce(1).write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, "parquet").mode(SaveMode.Append).save(tableLocation);
    for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
        DataFile file = task.file();
        Assert.assertEquals(2, file.nullValueCounts().size());
        Assert.assertEquals(2, file.valueCounts().size());
        Assert.assertTrue(file.lowerBounds().isEmpty());
        Assert.assertTrue(file.upperBounds().isEmpty());
    }
}

14 Source : TestWriteMetricsConfig.java
with Apache License 2.0
from apache

@Test
public void testFullMetricsCollectionForParquet() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> properties = Maps.newHashMap();
    properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full");
    Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    df.select("id", "data").coalesce(1).write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, "parquet").mode(SaveMode.Append).save(tableLocation);
    for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
        DataFile file = task.file();
        Assert.assertEquals(2, file.nullValueCounts().size());
        Assert.assertEquals(2, file.valueCounts().size());
        Assert.assertEquals(2, file.lowerBounds().size());
        Assert.assertEquals(2, file.upperBounds().size());
    }
}

14 Source : TestWriteMetricsConfig.java
with Apache License 2.0
from apache

@Test
public void testNoMetricsCollectionForParquet() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> properties = Maps.newHashMap();
    properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
    Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    df.select("id", "data").coalesce(1).write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, "parquet").mode(SaveMode.Append).save(tableLocation);
    for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
        DataFile file = task.file();
        Assert.assertTrue(file.nullValueCounts().isEmpty());
        Assert.assertTrue(file.valueCounts().isEmpty());
        Assert.assertTrue(file.lowerBounds().isEmpty());
        Assert.assertTrue(file.upperBounds().isEmpty());
    }
}

14 Source : TestRewriteDataFilesAction.java
with Apache License 2.0
from apache

@Test
public void testRewriteLargeTableHasResiduals() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).build();
    Map<String, String> options = Maps.newHashMap();
    options.put(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100");
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    // all records belong to the same partition
    List<ThreeColumnRecord> records = Lists.newArrayList();
    for (int i = 0; i < 100; i++) {
        records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i % 4)));
    }
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
    writeDF(df);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles();
    for (FileScanTask task : tasks) {
        Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual());
    }
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("c3", "0")).execute();
    Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    table.refresh();
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", records, actualRecords);
}

14 Source : RowDataReader.java
with Apache License 2.0
from apache

@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
    SparkDeleteFilter deletes = new SparkDeleteFilter(task, tableSchema, expectedSchema);
    // schema or rows returned by readers
    Schema requiredSchema = deletes.requiredSchema();
    Map<Integer, ?> idToConstant = PartitionUtil.constantsMap(task, RowDataReader::convertConstant);
    DataFile file = task.file();
    // update the current file for Spark's filename() function
    InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());
    return deletes.filter(open(task, requiredSchema, idToConstant)).iterator();
}

14 Source : RowDataIterator.java
with Apache License 2.0
from apache

private CloseableIterable<RowData> newParquetIterable(FileScanTask task, Schema schema, Map<Integer, ?> idToConstant) {
    Parquet.ReadBuilder builder = Parquet.read(getInputFile(task)).reuseContainers().split(task.start(), task.length()).project(schema).createReaderFunc(fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)).filter(task.residual()).caseSensitive(caseSensitive).reuseContainers();
    if (nameMapping != null) {
        builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }
    return builder.build();
}

14 Source : Util.java
with Apache License 2.0
from apache

public static String[] blockLocations(FileIO io, CombinedScanTask task) {
    Set<String> locations = Sets.newHashSet();
    for (FileScanTask f : task.files()) {
        InputFile in = io.newInputFile(f.file().path().toString());
        if (in instanceof HadoopInputFile) {
            Collections.addAll(locations, ((HadoopInputFile) in).getBlockLocations(f.start(), f.length()));
        }
    }
    return locations.toArray(HadoopInputFile.NO_LOCATION_PREFERENCE);
}
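
A hedged usage sketch: when building splits, the block locations can serve as preferred hosts for each combined task. The Table is assumed, planTasks() is used to group file scan tasks, and the import path assumes Util is the hadoop-module class shown above:

import java.io.IOException;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.Util;
import org.apache.iceberg.io.CloseableIterable;

static void printBlockLocations(Table table) throws IOException {
    try (CloseableIterable<CombinedScanTask> tasks = table.newScan().planTasks()) {
        for (CombinedScanTask task : tasks) {
            // candidate hosts to report as preferred locations for this split
            String[] hosts = Util.blockLocations(table.io(), task);
            System.out.println(String.join(", ", hosts));
        }
    }
}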

13 Source : IcebergSplitSource.java
with Apache License 2.0
from trinodb

@Override
public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize) {
    // TODO: move this to a background thread
    List<ConnectorSplit> splits = new ArrayList<>();
    Iterator<FileScanTask> iterator = limit(fileScanIterator, maxSize);
    while (iterator.hasNext()) {
        FileScanTask task = iterator.next();
        splits.add(toIcebergSplit(task));
    }
    return completedFuture(new ConnectorSplitBatch(splits, isFinished()));
}

13 Source : RowDataReader.java
with Apache License 2.0
from apache

private CloseableIterable<InternalRow> newOrcIterable(InputFile location, FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
    Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds()));
    ORC.ReadBuilder builder = ORC.read(location).project(readSchemaWithoutConstantAndMetadataFields).split(task.start(), task.length()).createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)).filter(task.residual()).caseSensitive(caseSensitive);
    if (nameMapping != null) {
        builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }
    return builder.build();
}

13 Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache

@Test
public void testPositionDeletePlanningPathFilter() throws IOException {
    table.newAppend().appendFile(dataFile).commit();
    List<Pair<CharSequence, Long>> deletes = Lists.newArrayList();
    deletes.add(Pair.of("some-other-file.parquet", 0L));
    deletes.add(Pair.of("some-other-file.parquet", 1L));
    Pair<DeleteFile, Set<CharSequence>> posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes);
    table.newRowDelta().addDeletes(posDeletes.first()).validateDataFilesExist(posDeletes.second()).commit();
    List<FileScanTask> tasks;
    try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
        tasks = Lists.newArrayList(tasksIterable);
    }
    Assert.assertEquals("Should produce one task", 1, tasks.size());
    FileScanTask task = tasks.get(0);
    Assert.assertEquals("Should not have delete file, filtered by file_path stats", 0, task.deletes().size());
}

12 Source : TableStatisticsMaker.java
with Apache License 2.0
from trinodb

private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
    if (tableHandle.getSnapshotId().isEmpty() || constraint.getSummary().isNone()) {
        return TableStatistics.empty();
    }
    TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary().transform(IcebergColumnHandle.class::cast).intersect(tableHandle.getEnforcedPredicate());
    if (intersection.isNone()) {
        return TableStatistics.empty();
    }
    List<Types.NestedField> columns = icebergTable.schema().columns();
    Map<Integer, Type.PrimitiveType> idToTypeMapping = columns.stream().filter(column -> column.type().isPrimitiveType()).collect(Collectors.toMap(Types.NestedField::fieldId, column -> column.type().asPrimitiveType()));
    List<PartitionField> partitionFields = icebergTable.spec().fields();
    Set<Integer> identityPartitionIds = getIdentityPartitions(icebergTable.spec()).keySet().stream().map(PartitionField::sourceId).collect(toSet());
    List<Types.NestedField> nonPartitionPrimitiveColumns = columns.stream().filter(column -> !identityPartitionIds.contains(column.fieldId()) && column.type().isPrimitiveType()).collect(toImmutableList());
    List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
    List<IcebergColumnHandle> columnHandles = getColumns(icebergTable.schema(), typeManager);
    Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream().collect(toUnmodifiableMap(IcebergColumnHandle::getId, identity()));
    ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
    for (int index = 0; index < partitionFields.size(); index++) {
        PartitionField field = partitionFields.get(index);
        Type type = icebergPartitionTypes.get(index);
        idToDetailsBuilder.put(field.sourceId(), new ColumnFieldDetails(field, idToColumnHandle.get(field.sourceId()), type, toTrinoType(type, typeManager), type.typeId().javaClass()));
    }
    Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.build();
    TableScan tableScan = icebergTable.newScan().filter(toIcebergExpression(intersection)).useSnapshot(tableHandle.getSnapshotId().get()).includeColumnStats();
    Partition summary = null;
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            if (!dataFileMatches(dataFile, constraint, idToTypeMapping, partitionFields, idToDetails)) {
                continue;
            }
            if (summary == null) {
                summary = new Partition(idToTypeMapping, nonPartitionPrimitiveColumns, dataFile.partition(), dataFile.recordCount(), dataFile.fileSizeInBytes(), toMap(idToTypeMapping, dataFile.lowerBounds()), toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.columnSizes());
            } else {
                summary.incrementFileCount();
                summary.incrementRecordCount(dataFile.recordCount());
                summary.incrementSize(dataFile.fileSizeInBytes());
                updateSummaryMin(summary, partitionFields, toMap(idToTypeMapping, dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                updateSummaryMax(summary, partitionFields, toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
                summary.updateNullCount(dataFile.nullValueCounts());
                updateColumnSizes(summary, dataFile.columnSizes());
            }
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    if (summary == null) {
        return TableStatistics.empty();
    }
    ImmutableMap.Builder<ColumnHandle, ColumnStatistics> columnHandleBuilder = ImmutableMap.builder();
    double recordCount = summary.getRecordCount();
    for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
        int fieldId = columnHandle.getId();
        ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
        Long nullCount = summary.getNullCounts().get(fieldId);
        if (nullCount != null) {
            columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
        }
        if (summary.getColumnSizes() != null) {
            Long columnSize = summary.getColumnSizes().get(fieldId);
            if (columnSize != null) {
                columnBuilder.setDataSize(Estimate.of(columnSize));
            }
        }
        Object min = summary.getMinValues().get(fieldId);
        Object max = summary.getMaxValues().get(fieldId);
        if (min instanceof Number && max instanceof Number) {
            columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue())));
        }
        columnHandleBuilder.put(columnHandle, columnBuilder.build());
    }
    return new TableStatistics(Estimate.of(recordCount), columnHandleBuilder.build());
}

12 Source : TestIcebergPartitionData.java
with Apache License 2.0
from dremio

private void verifyPartitionValue(PartitionSpec partitionSpec, IcebergPartitionData partitionData, String columnName, Class expectedClass, Object expectedValue) throws Exception {
    String tableName = "icebergPartitionTest";
    File tableFolder = new File(folder.getRoot(), tableName);
    try {
        tableFolder.mkdir();
        File dataFile = new File(folder.getRoot(), "a.parquet");
        dataFile.createNewFile();
        DataFile d1 = DataFiles.builder(partitionSpec).withInputFile(Files.localInput(dataFile)).withRecordCount(50).withFormat(FileFormat.PARQUET).withPartition(partitionData).build();
        IcebergOpCommitter committer = IcebergOperation.getCreateTableCommitter(tableName, Path.of(tableFolder.toPath().toString()), SchemaConverter.fromIceberg(schema), Lists.newArrayList(columnName), new Configuration());
        committer.consumeData(Lists.newArrayList(d1));
        committer.commit();
        Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
        for (FileScanTask fileScanTask : table.newScan().planFiles()) {
            StructLike structLike = fileScanTask.file().partition();
            if (expectedClass == ByteBuffer.class) {
                Assert.assertEquals(structLike.get(0, expectedClass).hashCode(), ByteBuffer.wrap((byte[]) expectedValue).hashCode());
            } else {
                Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
            }
        }
    } finally {
        tableFolder.delete();
    }
}

12 Source : TestWriteMetricsConfig.java
with Apache License 2.0
from apache

@Test
public void testCustomMetricCollectionForParquet() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> properties = Maps.newHashMap();
    properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
    properties.put("write.metadata.metrics.column.id", "full");
    Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    df.select("id", "data").coalesce(1).write().format("iceberg").option(SparkWriteOptions.WRITE_FORMAT, "parquet").mode(SaveMode.Append).save(tableLocation);
    Schema schema = table.schema();
    Types.NestedField id = schema.findField("id");
    for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
        DataFile file = task.file();
        Assert.assertEquals(2, file.nullValueCounts().size());
        Assert.assertEquals(2, file.valueCounts().size());
        Assert.assertEquals(1, file.lowerBounds().size());
        Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
        Assert.assertEquals(1, file.upperBounds().size());
        Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
    }
}

12 Source : RowDataIterator.java
with Apache License 2.0
from apache

private CloseableIterable<RowData> newOrcIterable(FileScanTask task, Schema schema, Map<Integer, ?> idToConstant) {
    Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds()));
    ORC.ReadBuilder builder = ORC.read(getInputFile(task)).project(readSchemaWithoutConstantAndMetadataFields).split(task.start(), task.length()).createReaderFunc(readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)).filter(task.residual()).caseSensitive(caseSensitive);
    if (nameMapping != null) {
        builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }
    return builder.build();
}

12 Source : RowDataIterator.java
with Apache License 2.0
from apache

@Override
protected CloseableIterator<RowData> openTaskIterator(FileScanTask task) {
    Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds());
    Map<Integer, ?> idToConstant = partitionSchema.columns().isEmpty() ? ImmutableMap.of() : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant);
    FlinkDeleteFilter deletes = new FlinkDeleteFilter(task, tableSchema, projectedSchema);
    CloseableIterable<RowData> iterable = deletes.filter(newIterable(task, deletes.requiredSchema(), idToConstant));
    return iterable.iterator();
}

12 Source : TestDataFileIndexStatsFilters.java
with Apache License 2.0
from apache

@Test
public void testPositionDeletePlanningPath() throws IOException {
    table.newAppend().appendFile(dataFile).commit();
    List<Pair<CharSequence, Long>> deletes = Lists.newArrayList();
    deletes.add(Pair.of(dataFile.path(), 0L));
    deletes.add(Pair.of(dataFile.path(), 1L));
    Pair<DeleteFile, Set<CharSequence>> posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes);
    table.newRowDelta().addDeletes(posDeletes.first()).validateDataFilesExist(posDeletes.second()).commit();
    List<FileScanTask> tasks;
    try (CloseableIterable<FileScanTask> tasksIterable = table.newScan().planFiles()) {
        tasks = Lists.newArrayList(tasksIterable);
    }
    Assert.assertEquals("Should produce one task", 1, tasks.size());
    FileScanTask task = tasks.get(0);
    Assert.assertEquals("Should have one delete file, file_path matches", 1, task.deletes().size());
}

12 Source : GenericReader.java
with Apache License 2.0
from apache

private CloseableIterable<Record> openFile(FileScanTask task, Schema fileProjection) {
    InputFile input = io.newInputFile(task.file().path().toString());
    Map<Integer, ?> partition = PartitionUtil.constantsMap(task, IdentityPartitionConverters::convertConstant);
    switch(task.file().format()) {
        case AVRO:
            Avro.ReadBuilder avro = Avro.read(input).project(fileProjection).createReaderFunc(avroSchema -> DataReader.create(fileProjection, avroSchema, partition)).split(task.start(), task.length());
            if (reuseContainers) {
                avro.reuseContainers();
            }
            return avro.build();
        case PARQUET:
            Parquet.ReadBuilder parquet = Parquet.read(input).project(fileProjection).createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(fileProjection, fileSchema, partition)).split(task.start(), task.length()).filter(task.residual());
            if (reuseContainers) {
                parquet.reuseContainers();
            }
            return parquet.build();
        case ORC:
            Schema projectionWithoutConstantAndMetadataFields = TypeUtil.selectNot(fileProjection, Sets.union(partition.keySet(), MetadataColumns.metadataFieldIds()));
            ORC.ReadBuilder orc = ORC.read(input).project(projectionWithoutConstantAndMetadataFields).createReaderFunc(fileSchema -> GenericOrcReader.buildReader(fileProjection, fileSchema, partition)).split(task.start(), task.length()).filter(task.residual());
            return orc.build();
        default:
            throw new UnsupportedOperationException(String.format("Cannot read %s file: %s", task.file().format().name(), task.file().path()));
    }
}

12 Source : TestTableSerialization.java
with Apache License 2.0
from apache

private static Set<CharSequence> getFiles(Table table) throws IOException {
    Set<CharSequence> files = Sets.newHashSet();
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
        for (FileScanTask task : tasks) {
            files.add(task.file().path());
        }
    }
    return files;
}

11 Source : PartitionTable.java
with Apache License 2.0
from trinodb

private Map<StructLikeWrapper, Partition> getPartitions(TableScan tableScan) {
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        Map<StructLikeWrapper, Partition> partitions = new HashMap<>();
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            Types.StructType structType = fileScanTask.spec().partitionType();
            StructLike partitionStruct = dataFile.partition();
            StructLikeWrapper partitionWrapper = StructLikeWrapper.forType(structType).set(partitionStruct);
            if (!partitions.containsKey(partitionWrapper)) {
                Partition partition = new Partition(idToTypeMapping, nonPartitionPrimitiveColumns, partitionStruct, dataFile.recordCount(), dataFile.fileSizeInBytes(), toMap(dataFile.lowerBounds()), toMap(dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.columnSizes());
                partitions.put(partitionWrapper, partition);
                continue;
            }
            Partition partition = partitions.get(partitionWrapper);
            partition.incrementFileCount();
            partition.incrementRecordCount(dataFile.recordCount());
            partition.incrementSize(dataFile.fileSizeInBytes());
            partition.updateMin(toMap(dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
            partition.updateMax(toMap(dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
            partition.updateNullCount(dataFile.nullValueCounts());
        }
        return partitions;
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
}
