From 86d8f3619258c1747fd2c76b0cb352f7534941c9 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Fri, 26 Dec 2025 12:48:18 -0500 Subject: [PATCH 1/9] init --- .../sdk/io/iceberg/IncrementalScanSource.java | 11 +- .../beam/sdk/io/iceberg/PartitionUtils.java | 54 +++ .../beam/sdk/io/iceberg/ReadFromTasks.java | 4 +- .../apache/beam/sdk/io/iceberg/ReadUtils.java | 211 +++++++++- .../beam/sdk/io/iceberg/ScanTaskReader.java | 5 +- .../sdk/io/iceberg/SerializableDataFile.java | 27 +- .../io/iceberg/SerializableDeleteFile.java | 387 ++++++++++++++++++ .../io/iceberg/cdc/ChangelogDescriptor.java | 67 +++ .../sdk/io/iceberg/cdc/ChangelogScanner.java | 218 ++++++++++ .../beam/sdk/io/iceberg/cdc/DeleteReader.java | 256 ++++++++++++ .../cdc/IncrementalChangelogSource.java | 148 +++++++ .../io/iceberg/cdc/ReadFromChangelogs.java | 279 +++++++++++++ .../sdk/io/iceberg/cdc/ReconcileChanges.java | 86 ++++ .../cdc/SerializableChangelogTask.java | 210 ++++++++++ 14 files changed, 1929 insertions(+), 34 deletions(-) create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogDescriptor.java create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/DeleteReader.java create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IncrementalScanSource.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IncrementalScanSource.java index 4df3eecb18e5..d09e1a1cb3ae 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IncrementalScanSource.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IncrementalScanSource.java @@ -42,11 +42,11 @@ * source creates a single range, while the unbounded implementation continuously polls for new * snapshots at the specified interval. */ -class IncrementalScanSource extends PTransform> { +public class IncrementalScanSource extends PTransform> { private static final Duration DEFAULT_POLL_INTERVAL = Duration.standardSeconds(60); - private final IcebergScanConfig scanConfig; + protected final IcebergScanConfig scanConfig; - IncrementalScanSource(IcebergScanConfig scanConfig) { + public IncrementalScanSource(IcebergScanConfig scanConfig) { this.scanConfig = scanConfig; } @@ -74,14 +74,15 @@ public PCollection expand(PBegin input) { } /** Continuously watches for new snapshots. */ - private PCollection>> unboundedSnapshots(PBegin input) { + protected PCollection>> unboundedSnapshots(PBegin input) { Duration pollInterval = MoreObjects.firstNonNull(scanConfig.getPollInterval(), DEFAULT_POLL_INTERVAL); return input.apply("Watch for Snapshots", new WatchForSnapshots(scanConfig, pollInterval)); } /** Creates a fixed snapshot range. 
*/ - private PCollection>> boundedSnapshots(PBegin input, Table table) { + protected PCollection>> boundedSnapshots( + PBegin input, Table table) { checkStateNotNull( table.currentSnapshot().snapshotId(), "Table %s does not have any snapshots to read from.", diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java index 4b94663c64c5..8fbef14e3eb5 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java @@ -25,8 +25,15 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.PartitionField; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; import org.checkerframework.checker.nullness.qual.Nullable; class PartitionUtils { @@ -90,4 +97,51 @@ static PartitionSpec toPartitionSpec( return builder.build(); } + + /** + * Copied over from Apache Iceberg's PartitionUtil + */ + public static Map constantsMap( + PartitionSpec spec, ContentFile file, BiFunction convertConstant) { + StructLike partitionData = file.partition(); + + // use java.util.HashMap because partition data may contain null values + Map idToConstant = Maps.newHashMap(); + + // add first_row_id as _row_id + if (file.firstRowId() != null) { + idToConstant.put( + MetadataColumns.ROW_ID.fieldId(), + convertConstant.apply(Types.LongType.get(), file.firstRowId())); + } + + idToConstant.put( + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(), + convertConstant.apply(Types.LongType.get(), file.fileSequenceNumber())); + + // add _file + idToConstant.put( + MetadataColumns.FILE_PATH.fieldId(), + convertConstant.apply(Types.StringType.get(), file.location())); + + // add _spec_id + idToConstant.put( + MetadataColumns.SPEC_ID.fieldId(), + convertConstant.apply(Types.IntegerType.get(), file.specId())); + + List partitionFields = spec.partitionType().fields(); + List fields = spec.fields(); + for (int pos = 0; pos < fields.size(); pos += 1) { + PartitionField field = fields.get(pos); + if (field.transform().isIdentity()) { + Object converted = + convertConstant.apply( + partitionFields.get(pos).type(), partitionData.get(pos, Object.class)); + idToConstant.put(field.sourceId(), converted); + } + } + + return idToConstant; + } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadFromTasks.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadFromTasks.java index 528b89c203bf..fea62356e431 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadFromTasks.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadFromTasks.java @@ -75,9 +75,7 @@ public void process( } FileScanTask task = fileScanTasks.get((int) l); Schema beamSchema = IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema()); - try (CloseableIterable fullIterable = - ReadUtils.createReader(task, table, scanConfig.getRequiredSchema())) { - CloseableIterable reader = 
ReadUtils.maybeApplyFilter(fullIterable, scanConfig); + try (CloseableIterable reader = ReadUtils.createReader(task, table, scanConfig)) { for (Record record : reader) { Row row = IcebergUtils.icebergRecordToBeamRow(beamSchema, record); diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java index 4b127fcdef22..e918902ea47d 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import static org.apache.iceberg.util.SnapshotUtil.ancestorsOf; import java.util.Collection; @@ -28,16 +29,22 @@ import java.util.function.BiFunction; import java.util.stream.Collectors; import org.apache.beam.sdk.io.iceberg.IcebergIO.ReadRows.StartingStrategy; +import org.apache.beam.sdk.io.iceberg.cdc.DeleteReader; +import org.apache.beam.sdk.io.iceberg.cdc.SerializableChangelogTask; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; +import org.apache.iceberg.DeleteFile; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; +import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.DeleteFilter; import org.apache.iceberg.data.IdentityPartitionConverters; import org.apache.iceberg.data.InternalRecordWrapper; import org.apache.iceberg.data.Record; @@ -55,7 +62,6 @@ import org.apache.iceberg.parquet.ParquetReader; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.PartitionUtil; import org.apache.iceberg.util.SnapshotUtil; import org.apache.parquet.HadoopReadOptions; import org.apache.parquet.ParquetReadOptions; @@ -72,16 +78,42 @@ public class ReadUtils { "parquet.read.support.class", "parquet.crypto.factory.class"); - static ParquetReader createReader(FileScanTask task, Table table, Schema schema) { - String filePath = task.file().path().toString(); + public static CloseableIterable createReader( + SerializableChangelogTask task, Table table, IcebergScanConfig scanConfig) { + return createReader( + table, + scanConfig, + checkStateNotNull(table.specs().get(task.getSpecId())), + task.getDataFile().createDataFile(table.specs()), + task.getStart(), + task.getLength(), + task.getExpression(table.schema())); + } + + public static CloseableIterable createReader( + ContentScanTask task, Table table, IcebergScanConfig scanConfig) { + return createReader( + table, scanConfig, task.spec(), task.file(), task.start(), task.length(), task.residual()); + } + + public static CloseableIterable createReader( + Table table, + IcebergScanConfig scanConfig, + PartitionSpec spec, + ContentFile file, + long start, + long length, + Expression residual) { + Schema schema = scanConfig.getRequiredSchema(); InputFile inputFile; try (FileIO io = table.io()) { EncryptedInputFile encryptedInput = - 
EncryptedFiles.encryptedInput(io.newInputFile(filePath), task.file().keyMetadata()); + EncryptedFiles.encryptedInput(io.newInputFile(file.location()), file.keyMetadata()); inputFile = table.encryption().decrypt(encryptedInput); } Map idToConstants = - ReadUtils.constantsMap(task, IdentityPartitionConverters::convertConstant, table.schema()); + ReadUtils.constantsMap( + spec, file, IdentityPartitionConverters::convertConstant, table.schema()); ParquetReadOptions.Builder optionsBuilder; if (inputFile instanceof HadoopInputFile) { @@ -96,37 +128,40 @@ static ParquetReader createReader(FileScanTask task, Table table, Schema } optionsBuilder = optionsBuilder - .withRange(task.start(), task.start() + task.length()) + .withRange(start, start + length) .withMaxAllocationInBytes(MAX_FILE_BUFFER_SIZE); @Nullable String nameMapping = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); NameMapping mapping = nameMapping != null ? NameMappingParser.fromJson(nameMapping) : NameMapping.empty(); - return new ParquetReader<>( - inputFile, - schema, - optionsBuilder.build(), - // TODO(ahmedabu98): Implement a Parquet-to-Beam Row reader, bypassing conversion to Iceberg - // Record - fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema, idToConstants), - mapping, - task.residual(), - false, - true); + ParquetReader records = + new ParquetReader<>( + inputFile, + schema, + optionsBuilder.build(), + // TODO(ahmedabu98): Implement a Parquet-to-Beam Row reader, bypassing conversion to + // Iceberg + // Record + fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema, idToConstants), + mapping, + residual, + false, + true); + return maybeApplyFilter(records, scanConfig); } static Map constantsMap( - FileScanTask task, + PartitionSpec spec, + ContentFile file, BiFunction converter, org.apache.iceberg.Schema schema) { - PartitionSpec spec = task.spec(); Set idColumns = spec.identitySourceIds(); org.apache.iceberg.Schema partitionSchema = TypeUtil.select(schema, idColumns); boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty(); if (projectsIdentityPartitionColumns) { - return PartitionUtil.constantsMap(task, converter); + return PartitionUtils.constantsMap(spec, file, converter); } else { return Collections.emptyMap(); } @@ -208,4 +243,138 @@ public static CloseableIterable maybeApplyFilter( } return iterable; } + + public static DeleteFilter genericDeleteFilter( + Table table, + IcebergScanConfig scanConfig, + String dataFilePath, + List deletes) { + return new BeamDeleteFilter( + table.io(), + dataFilePath, + scanConfig.getRequiredSchema(), + scanConfig.getProjectedSchema(), + deletes.stream() + .map(sdf -> sdf.createDeleteFile(table.specs(), table.sortOrders())) + .collect(Collectors.toList())); + } + + public static DeleteReader genericDeleteReader( + Table table, + IcebergScanConfig scanConfig, + String dataFilePath, + List deletes) { + return new BeamDeleteReader( + table.io(), + dataFilePath, + scanConfig.getRequiredSchema(), + scanConfig.getProjectedSchema(), + deletes.stream() + .map(sdf -> sdf.createDeleteFile(table.specs(), table.sortOrders())) + .collect(Collectors.toList())); + } + + public static class BeamDeleteFilter extends DeleteFilter { + private final FileIO io; + private final InternalRecordWrapper asStructLike; + + @SuppressWarnings("method.invocation") + public BeamDeleteFilter( + FileIO io, + String dataFilePath, + Schema tableSchema, + Schema projectedSchema, + List deleteFiles) { + super(dataFilePath, deleteFiles, tableSchema, 
projectedSchema); + this.io = io; + this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); + } + + // TODO: remove this (unused) + @SuppressWarnings("method.invocation") + public BeamDeleteFilter( + FileIO io, + SerializableChangelogTask scanTask, + Schema tableSchema, + Schema projectedSchema, + List deleteFiles) { + super(scanTask.getDataFile().getPath(), deleteFiles, tableSchema, projectedSchema); + this.io = io; + this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); + } + + // TODO: remove this (unused) + @SuppressWarnings("method.invocation") + public BeamDeleteFilter(FileIO io, ContentScanTask scanTask, List deleteFiles) { + super( + scanTask.file().location(), + deleteFiles, + scanTask.spec().schema(), + scanTask.spec().schema()); + this.io = io; + this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); + } + + @Override + protected StructLike asStructLike(Record record) { + return asStructLike.wrap(record); + } + + @Override + protected InputFile getInputFile(String location) { + return io.newInputFile(location); + } + } + + public static class BeamDeleteReader extends DeleteReader { + private final FileIO io; + private final InternalRecordWrapper asStructLike; + + @SuppressWarnings("method.invocation") + public BeamDeleteReader( + FileIO io, + String dataFilePath, + Schema tableSchema, + Schema projectedSchema, + List deleteFiles) { + super(dataFilePath, deleteFiles, tableSchema, projectedSchema); + this.io = io; + this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); + } + + // TODO: remove this (unused) + @SuppressWarnings("method.invocation") + public BeamDeleteReader( + FileIO io, + SerializableChangelogTask scanTask, + Schema tableSchema, + Schema projectedSchema, + List deleteFiles) { + super(scanTask.getDataFile().getPath(), deleteFiles, tableSchema, projectedSchema); + this.io = io; + this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); + } + + // TODO: remove this (unused) + @SuppressWarnings("method.invocation") + public BeamDeleteReader(FileIO io, ContentScanTask scanTask, List deleteFiles) { + super( + scanTask.file().location(), + deleteFiles, + scanTask.spec().schema(), + scanTask.spec().schema()); + this.io = io; + this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); + } + + @Override + protected StructLike asStructLike(Record record) { + return asStructLike.wrap(record); + } + + @Override + protected InputFile getInputFile(String location) { + return io.newInputFile(location); + } + } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java index 81ec229df70f..452012766e3c 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java @@ -122,7 +122,10 @@ public boolean advance() throws IOException { InputFile input = decryptor.getInputFile(fileTask); Map idToConstants = ReadUtils.constantsMap( - fileTask, IdentityPartitionConverters::convertConstant, requiredSchema); + fileTask.spec(), + fileTask.file(), + IdentityPartitionConverters::convertConstant, + requiredSchema); CloseableIterable iterable; switch (file.format()) { diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java 
b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java index 5c994c3e5651..2db173d75947 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java @@ -54,13 +54,13 @@ */ @DefaultSchema(AutoValueSchema.class) @AutoValue -abstract class SerializableDataFile { +public abstract class SerializableDataFile { public static Builder builder() { return new AutoValue_SerializableDataFile.Builder(); } @SchemaFieldNumber("0") - abstract String getPath(); + public abstract String getPath(); @SchemaFieldNumber("1") abstract String getFileFormat(); @@ -69,7 +69,7 @@ public static Builder builder() { abstract long getRecordCount(); @SchemaFieldNumber("3") - abstract long getFileSizeInBytes(); + public abstract long getFileSizeInBytes(); @SchemaFieldNumber("4") abstract String getPartitionPath(); @@ -101,6 +101,15 @@ public static Builder builder() { @SchemaFieldNumber("13") abstract @Nullable Map getUpperBounds(); + @SchemaFieldNumber("14") + public abstract @Nullable Long getDataSequenceNumber(); + + @SchemaFieldNumber("15") + public abstract @Nullable Long getFileSequenceNumber(); + + @SchemaFieldNumber("16") + public abstract @Nullable Long getFirstRowId(); + @AutoValue.Builder abstract static class Builder { abstract Builder setPath(String path); @@ -131,6 +140,12 @@ abstract static class Builder { abstract Builder setUpperBounds(@Nullable Map upperBounds); + abstract Builder setDataSequenceNumber(@Nullable Long number); + + abstract Builder setFileSequenceNumber(@Nullable Long number); + + abstract Builder setFirstRowId(@Nullable Long id); + abstract SerializableDataFile build(); } @@ -138,7 +153,7 @@ abstract static class Builder { * Create a {@link SerializableDataFile} from a {@link DataFile} and its associated {@link * PartitionKey}. */ - static SerializableDataFile from(DataFile f, String partitionPath) { + public static SerializableDataFile from(DataFile f, String partitionPath) { return SerializableDataFile.builder() .setPath(f.path().toString()) @@ -155,6 +170,9 @@ static SerializableDataFile from(DataFile f, String partitionPath) { .setNanValueCounts(f.nanValueCounts()) .setLowerBounds(toByteArrayMap(f.lowerBounds())) .setUpperBounds(toByteArrayMap(f.upperBounds())) + .setDataSequenceNumber(f.dataSequenceNumber()) + .setFileSequenceNumber(f.fileSequenceNumber()) + .setFirstRowId(f.firstRowId()) .build(); } @@ -192,6 +210,7 @@ DataFile createDataFile(Map partitionSpecs) { .withFileSizeInBytes(getFileSizeInBytes()) .withMetrics(dataFileMetrics) .withSplitOffsets(getSplitOffsets()) + .withFirstRowId(getFirstRowId()) .build(); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java new file mode 100644 index 000000000000..9653f977805d --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java @@ -0,0 +1,387 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import com.google.auto.value.AutoValue; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldNumber; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Equivalence; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileMetadata; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SortOrder; +import org.checkerframework.checker.nullness.qual.Nullable; + +@DefaultSchema(AutoValueSchema.class) +@AutoValue +public abstract class SerializableDeleteFile { + public static SerializableDeleteFile.Builder builder() { + return new AutoValue_SerializableDeleteFile.Builder(); + } + + @SchemaFieldNumber("0") + public abstract FileContent getContentType(); + + @SchemaFieldNumber("1") + public abstract String getLocation(); + + @SchemaFieldNumber("2") + public abstract String getFileFormat(); + + @SchemaFieldNumber("3") + public abstract long getRecordCount(); + + @SchemaFieldNumber("4") + public abstract long getFileSizeInBytes(); + + @SchemaFieldNumber("5") + public abstract String getPartitionPath(); + + @SchemaFieldNumber("6") + public abstract int getPartitionSpecId(); + + @SchemaFieldNumber("7") + public abstract @Nullable Integer getSortOrderId(); + + @SchemaFieldNumber("8") + public abstract @Nullable List getEqualityFieldIds(); + + @SchemaFieldNumber("9") + public abstract @Nullable ByteBuffer getKeyMetadata(); + + @SchemaFieldNumber("10") + public abstract @Nullable List getSplitOffsets(); + + @SchemaFieldNumber("11") + public abstract @Nullable Map getColumnSizes(); + + @SchemaFieldNumber("12") + public abstract @Nullable Map getValueCounts(); + + @SchemaFieldNumber("13") + public abstract @Nullable Map getNullValueCounts(); + + @SchemaFieldNumber("14") + public abstract @Nullable Map getNanValueCounts(); + + @SchemaFieldNumber("15") + public abstract @Nullable Map getLowerBounds(); + + @SchemaFieldNumber("16") + public abstract @Nullable Map getUpperBounds(); + + @SchemaFieldNumber("17") + public abstract @Nullable Long getContentOffset(); + + @SchemaFieldNumber("18") + public abstract @Nullable Long getContentSizeInBytes(); + + @SchemaFieldNumber("19") + public abstract @Nullable String getReferencedDataFile(); + + @SchemaFieldNumber("20") + public abstract @Nullable Long getDataSequenceNumber(); + + @SchemaFieldNumber("21") + public abstract @Nullable Long getFileSequenceNumber(); + + @AutoValue.Builder + abstract static class Builder { + abstract Builder setContentType(FileContent content); + + abstract 
Builder setLocation(String path); + + abstract Builder setFileFormat(String fileFormat); + + abstract Builder setRecordCount(long recordCount); + + abstract Builder setFileSizeInBytes(long fileSizeInBytes); + + abstract Builder setPartitionPath(String partitionPath); + + abstract Builder setPartitionSpecId(int partitionSpec); + + abstract Builder setSortOrderId(int sortOrderId); + + abstract Builder setEqualityFieldIds(List equalityFieldIds); + + abstract Builder setKeyMetadata(ByteBuffer keyMetadata); + + abstract Builder setSplitOffsets(List splitOffsets); + + abstract Builder setColumnSizes(Map columnSizes); + + abstract Builder setValueCounts(Map valueCounts); + + abstract Builder setNullValueCounts(Map nullValueCounts); + + abstract Builder setNanValueCounts(Map nanValueCounts); + + abstract Builder setLowerBounds(@Nullable Map lowerBounds); + + abstract Builder setUpperBounds(@Nullable Map upperBounds); + + abstract Builder setContentOffset(@Nullable Long offset); + + abstract Builder setContentSizeInBytes(@Nullable Long sizeInBytes); + + abstract Builder setReferencedDataFile(@Nullable String dataFile); + + abstract Builder setDataSequenceNumber(@Nullable Long number); + + abstract Builder setFileSequenceNumber(@Nullable Long number); + + abstract SerializableDeleteFile build(); + } + + public static SerializableDeleteFile from(DeleteFile deleteFile, String partitionPath) { + return SerializableDeleteFile.builder() + .setLocation(deleteFile.location()) + .setFileFormat(deleteFile.format().name()) + .setFileSizeInBytes(deleteFile.fileSizeInBytes()) + .setPartitionPath(partitionPath) + .setPartitionSpecId(deleteFile.specId()) + .setRecordCount(deleteFile.recordCount()) + .setColumnSizes(deleteFile.columnSizes()) + .setValueCounts(deleteFile.valueCounts()) + .setNullValueCounts(deleteFile.nullValueCounts()) + .setNanValueCounts(deleteFile.nanValueCounts()) + .setLowerBounds(toByteArrayMap(deleteFile.lowerBounds())) + .setUpperBounds(toByteArrayMap(deleteFile.upperBounds())) + .setSplitOffsets(deleteFile.splitOffsets()) + .setKeyMetadata(deleteFile.keyMetadata()) + .setEqualityFieldIds(deleteFile.equalityFieldIds()) + .setSortOrderId(deleteFile.sortOrderId()) + .setContentOffset(deleteFile.contentOffset()) + .setContentSizeInBytes(deleteFile.contentSizeInBytes()) + .setReferencedDataFile(deleteFile.referencedDataFile()) + .setContentType(deleteFile.content()) + .setDataSequenceNumber(deleteFile.dataSequenceNumber()) + .setFileSequenceNumber(deleteFile.fileSequenceNumber()) + .build(); + } + + @SuppressWarnings("nullness") + public DeleteFile createDeleteFile( + Map partitionSpecs, @Nullable Map sortOrders) { + PartitionSpec partitionSpec = + checkStateNotNull( + partitionSpecs.get(getPartitionSpecId()), + "This DeleteFile was originally created with spec id '%s', " + + "but table only has spec ids: %s.", + getPartitionSpecId(), + partitionSpecs.keySet()); + + Metrics metrics = + new Metrics( + getRecordCount(), + getColumnSizes(), + getValueCounts(), + getNullValueCounts(), + getNanValueCounts(), + toByteBufferMap(getLowerBounds()), + toByteBufferMap(getUpperBounds())); + + FileMetadata.Builder deleteFileBuilder = + FileMetadata.deleteFileBuilder(partitionSpec) + .withPath(getLocation()) + .withFormat(getFileFormat()) + .withFileSizeInBytes(getFileSizeInBytes()) + .withRecordCount(getRecordCount()) + .withMetrics(metrics) + .withSplitOffsets(getSplitOffsets()) + .withEncryptionKeyMetadata(getKeyMetadata()) + .withPartitionPath(getPartitionPath()); + + switch (getContentType()) { + case 
POSITION_DELETES: + deleteFileBuilder = deleteFileBuilder.ofPositionDeletes(); + break; + case EQUALITY_DELETES: + int[] equalityFieldIds = + Objects.requireNonNullElse(getEqualityFieldIds(), new ArrayList()).stream() + .mapToInt(Integer::intValue) + .toArray(); + SortOrder sortOrder = SortOrder.unsorted(); + if (sortOrders != null) { + sortOrder = + checkStateNotNull( + sortOrders.get(getSortOrderId()), + "This DeleteFile was originally created with sort order id '%s', " + + "but table only has sort order ids: %s.", + getSortOrderId(), + sortOrders.keySet()); + } + deleteFileBuilder = + deleteFileBuilder.ofEqualityDeletes(equalityFieldIds).withSortOrder(sortOrder); + break; + default: + throw new IllegalStateException( + "Unexpected content type for DeleteFile: " + getContentType()); + } + + // needed for puffin files + if (getFileFormat().equalsIgnoreCase(FileFormat.PUFFIN.name())) { + deleteFileBuilder = + deleteFileBuilder + .withContentOffset(checkStateNotNull(getContentOffset())) + .withContentSizeInBytes(checkStateNotNull(getContentSizeInBytes())) + .withReferencedDataFile(checkStateNotNull(getReferencedDataFile())); + } + return deleteFileBuilder.build(); + } + + // ByteBuddyUtils has trouble converting Map value type ByteBuffer + // to byte[] and back to ByteBuffer, so we perform these conversions manually + // TODO(https://github.com/apache/beam/issues/32701) + private static @Nullable Map toByteArrayMap( + @Nullable Map input) { + if (input == null) { + return null; + } + Map output = new HashMap<>(input.size()); + for (Map.Entry e : input.entrySet()) { + output.put(e.getKey(), e.getValue().array()); + } + return output; + } + + private static @Nullable Map toByteBufferMap( + @Nullable Map input) { + if (input == null) { + return null; + } + Map output = new HashMap<>(input.size()); + for (Map.Entry e : input.entrySet()) { + output.put(e.getKey(), ByteBuffer.wrap(e.getValue())); + } + return output; + } + + @Override + public final boolean equals(@Nullable Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SerializableDeleteFile that = (SerializableDeleteFile) o; + return getContentType().equals(that.getContentType()) + && getLocation().equals(that.getLocation()) + && getFileFormat().equals(that.getFileFormat()) + && getRecordCount() == that.getRecordCount() + && getFileSizeInBytes() == that.getFileSizeInBytes() + && getPartitionPath().equals(that.getPartitionPath()) + && getPartitionSpecId() == that.getPartitionSpecId() + && Objects.equals(getSortOrderId(), that.getSortOrderId()) + && Objects.equals(getEqualityFieldIds(), that.getEqualityFieldIds()) + && Objects.equals(getKeyMetadata(), that.getKeyMetadata()) + && Objects.equals(getSplitOffsets(), that.getSplitOffsets()) + && Objects.equals(getColumnSizes(), that.getColumnSizes()) + && Objects.equals(getValueCounts(), that.getValueCounts()) + && Objects.equals(getNullValueCounts(), that.getNullValueCounts()) + && Objects.equals(getNanValueCounts(), that.getNanValueCounts()) + && mapEquals(getLowerBounds(), that.getLowerBounds()) + && mapEquals(getUpperBounds(), that.getUpperBounds()) + && Objects.equals(getContentOffset(), that.getContentOffset()) + && Objects.equals(getContentSizeInBytes(), that.getContentSizeInBytes()) + && Objects.equals(getReferencedDataFile(), that.getReferencedDataFile()) + && Objects.equals(getDataSequenceNumber(), that.getDataSequenceNumber()) + && Objects.equals(getFileSequenceNumber(), that.getFileSequenceNumber()); + } + + private 
static boolean mapEquals( + @Nullable Map map1, @Nullable Map map2) { + if (map1 == null && map2 == null) { + return true; + } else if (map1 == null || map2 == null) { + return false; + } + Equivalence byteArrayEquivalence = + new Equivalence() { + @Override + protected boolean doEquivalent(byte[] a, byte[] b) { + return Arrays.equals(a, b); + } + + @Override + protected int doHash(byte[] bytes) { + return Arrays.hashCode(bytes); + } + }; + + return Maps.difference(map1, map2, byteArrayEquivalence).areEqual(); + } + + @Override + public final int hashCode() { + int hashCode = + Objects.hash( + getContentType(), + getLocation(), + getFileFormat(), + getRecordCount(), + getFileSizeInBytes(), + getPartitionPath(), + getPartitionSpecId(), + getSortOrderId(), + getEqualityFieldIds(), + getKeyMetadata(), + getSplitOffsets(), + getColumnSizes(), + getValueCounts(), + getNullValueCounts(), + getNanValueCounts(), + getContentOffset(), + getContentSizeInBytes(), + getReferencedDataFile(), + getDataSequenceNumber(), + getFileSequenceNumber()); + hashCode = 31 * hashCode + computeMapByteHashCode(getLowerBounds()); + hashCode = 31 * hashCode + computeMapByteHashCode(getUpperBounds()); + return hashCode; + } + + private static int computeMapByteHashCode(@Nullable Map map) { + if (map == null) { + return 0; + } + int hashCode = 0; + for (Map.Entry entry : map.entrySet()) { + int keyHash = entry.getKey().hashCode(); + int valueHash = Arrays.hashCode(entry.getValue()); // content-based hash code + hashCode += keyHash ^ valueHash; + } + return hashCode; + } +} diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogDescriptor.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogDescriptor.java new file mode 100644 index 000000000000..2b35922a2270 --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogDescriptor.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.iceberg.cdc; + +import com.google.auto.value.AutoValue; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.NoSuchSchemaException; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.schemas.SchemaRegistry; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldNumber; + +@DefaultSchema(AutoValueSchema.class) +@AutoValue +public abstract class ChangelogDescriptor { + public static Builder builder() { + return new AutoValue_ChangelogDescriptor.Builder(); + } + + public static SchemaCoder coder() { + try { + return SchemaRegistry.createDefault().getSchemaCoder(ChangelogDescriptor.class); + } catch (NoSuchSchemaException e) { + throw new RuntimeException(e); + } + } + + @SchemaFieldNumber("0") + abstract String getTableIdentifierString(); + + @SchemaFieldNumber("1") + abstract long getStartSnapshotId(); + + @SchemaFieldNumber("2") + abstract long getEndSnapshotId(); + + @SchemaFieldNumber("3") + abstract int getChangeOrdinal(); + + @AutoValue.Builder + public abstract static class Builder { + abstract Builder setTableIdentifierString(String table); + + abstract Builder setStartSnapshotId(long snapshotId); + + abstract Builder setEndSnapshotId(long snapshotId); + + abstract Builder setChangeOrdinal(int ordinal); + + abstract ChangelogDescriptor build(); + } +} diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java new file mode 100644 index 000000000000..5e9583ded83d --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.iceberg.cdc; + +import static org.apache.beam.sdk.io.iceberg.cdc.SerializableChangelogTask.Type.ADDED_ROWS; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.coders.ListCoder; +import org.apache.beam.sdk.io.iceberg.IcebergScanConfig; +import org.apache.beam.sdk.io.iceberg.SnapshotInfo; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.iceberg.ChangelogScanTask; +import org.apache.iceberg.IncrementalChangelogScan; +import org.apache.iceberg.ScanTaskGroup; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.CloseableIterable; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ChangelogScanner + extends DoFn< + KV>, KV>> { + private static final Logger LOG = LoggerFactory.getLogger(ChangelogScanner.class); + private static final Counter totalChangelogScanTasks = + Metrics.counter(ChangelogScanner.class, "totalChangelogScanTasks"); + private static final Counter numAddedRowsScanTasks = + Metrics.counter(ChangelogScanner.class, "numAddedRowsScanTasks"); + private static final Counter numDeletedRowsScanTasks = + Metrics.counter(ChangelogScanner.class, "numDeletedRowsScanTasks"); + private static final Counter numDeletedDataFileScanTasks = + Metrics.counter(ChangelogScanner.class, "numDeletedDataFileScanTasks"); + public static final TupleTag>> + UNIFORM_CHANGES = new TupleTag<>(); + public static final TupleTag>> + MIXED_CHANGES = new TupleTag<>(); + public static final KvCoder> OUTPUT_CODER = + KvCoder.of(ChangelogDescriptor.coder(), ListCoder.of(SerializableChangelogTask.coder())); + private final IcebergScanConfig scanConfig; + + ChangelogScanner(IcebergScanConfig scanConfig) { + this.scanConfig = scanConfig; + } + + @ProcessElement + public void process(@Element KV> element, MultiOutputReceiver out) + throws IOException { + // TODO: use TableCache here + Table table = scanConfig.getTable(); + table.refresh(); + + List snapshots = element.getValue(); + SnapshotInfo startSnapshot = snapshots.get(0); + SnapshotInfo endSnapshot = snapshots.get(snapshots.size() - 1); + @Nullable Long fromSnapshotId = startSnapshot.getParentId(); + long toSnapshot = endSnapshot.getSnapshotId(); + + IncrementalChangelogScan scan = + table + .newIncrementalChangelogScan() + .toSnapshot(toSnapshot) + .project(scanConfig.getProjectedSchema()); + if (fromSnapshotId != null) { + scan = scan.fromSnapshotExclusive(fromSnapshotId); + } + @Nullable Expression filter = scanConfig.getFilter(); + if (filter != null) { + scan = scan.filter(filter); + } + LOG.info("Planning to scan snapshot range [{}, {}]", fromSnapshotId, toSnapshot); + + createAndOutputReadTasks( + scan, startSnapshot, endSnapshot, SerializableTable.copyOf(table), out); + } + + private void createAndOutputReadTasks( + IncrementalChangelogScan scan, + SnapshotInfo startSnapshot, + SnapshotInfo endSnapshot, + Table table, + MultiOutputReceiver 
multiOutputReceiver) + throws IOException { + int numAddedRowsTasks = 0; + int numDeletedRowsTasks = 0; + int numDeletedFileTasks = 0; + + Map cachedSnapshotTimestamps = new HashMap<>(); + // Maintain the same scan task groupings produced by Iceberg's binpacking, for + // better work load distribution among readers. + // Also allows the user to control by setting a `read.split.target-size`: + // https://iceberg.apache.org/docs/latest/configuration/#read-properties + Map>> changelogScanTaskGroups = new HashMap<>(); + + // keep track of the types of changes in each ordinal + Map> changeTypesPerOrdinal = new HashMap<>(); + + try (CloseableIterable> scanTaskGroups = scan.planTasks()) { + for (ScanTaskGroup scanTaskGroup : scanTaskGroups) { + Map> ordinalGroups = new HashMap<>(); + + for (ChangelogScanTask changelogScanTask : scanTaskGroup.tasks()) { + long snapshotId = changelogScanTask.commitSnapshotId(); + long timestampMillis = + cachedSnapshotTimestamps.computeIfAbsent( + snapshotId, (snapId) -> table.snapshot(snapId).timestampMillis()); + int ordinal = changelogScanTask.changeOrdinal(); + + SerializableChangelogTask task = + SerializableChangelogTask.from(changelogScanTask, timestampMillis); + ordinalGroups.computeIfAbsent(ordinal, (unused) -> new ArrayList<>()).add(task); + + changeTypesPerOrdinal + .computeIfAbsent(ordinal, (o) -> new HashSet<>()) + .add(task.getType()); + + // metric gathering + switch (task.getType()) { + case ADDED_ROWS: + numAddedRowsTasks++; + break; + case DELETED_ROWS: + numDeletedRowsTasks++; + break; + case DELETED_FILE: + numDeletedFileTasks++; + break; + } + } + + for (Map.Entry> ordinalGroup : + ordinalGroups.entrySet()) { + changelogScanTaskGroups + .computeIfAbsent(ordinalGroup.getKey(), (unused) -> new ArrayList<>()) + .add(ordinalGroup.getValue()); + } + } + } + + int totalTasks = numAddedRowsTasks + numDeletedRowsTasks + numDeletedFileTasks; + totalChangelogScanTasks.inc(totalTasks); + numAddedRowsScanTasks.inc(numAddedRowsTasks); + numDeletedRowsScanTasks.inc(numDeletedRowsTasks); + numDeletedDataFileScanTasks.inc(numDeletedFileTasks); + + LOG.info( + "Snapshots [{}, {}] produced {} tasks:\n\t{} AddedRowsScanTasks\n\t{} DeletedRowsScanTasks\n\t{} DeletedDataFileScanTasks", + startSnapshot.getSnapshotId(), + endSnapshot.getSnapshotId(), + totalTasks, + numAddedRowsTasks, + numDeletedRowsTasks, + numDeletedFileTasks); + + for (Map.Entry>> taskGroups : + changelogScanTaskGroups.entrySet()) { + int ordinal = taskGroups.getKey(); + ChangelogDescriptor descriptor = + ChangelogDescriptor.builder() + .setTableIdentifierString(checkStateNotNull(startSnapshot.getTableIdentifierString())) + .setStartSnapshotId(startSnapshot.getSnapshotId()) + .setEndSnapshotId(endSnapshot.getSnapshotId()) + .setChangeOrdinal(ordinal) + .build(); + + for (List subgroup : taskGroups.getValue()) { + Instant timestamp = Instant.ofEpochMilli(subgroup.get(0).getTimestampMillis()); + KV> output = + KV.of(descriptor, subgroup); + + // Determine where each ordinal's tasks will go, based on the type of changes: + // 1. If an ordinal's changes are uniform (i.e. all inserts or all deletes), they should be + // processed directly in the fast path. + // 2. If an ordinal's changes are mixed (i.e. some inserts and some deletes), they will need + // more careful processing to determine if any updates have occurred. 
+ Set changeTypes = + checkStateNotNull(changeTypesPerOrdinal.get(ordinal)); + TupleTag>> outputTag; + if (changeTypes.contains(ADDED_ROWS) && changeTypes.size() > 1) { // added and deleted rows + outputTag = MIXED_CHANGES; + } else { // all added or all deleted rows + outputTag = UNIFORM_CHANGES; + } + + multiOutputReceiver.get(outputTag).outputWithTimestamp(output, timestamp); + } + } + } +} diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/DeleteReader.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/DeleteReader.java new file mode 100644 index 000000000000..e85bac6136a3 --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/DeleteReader.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.cdc; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Predicate; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Multimap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Multimaps; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; +import org.apache.iceberg.Accessor; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.data.BaseDeleteLoader; +import org.apache.iceberg.data.DeleteLoader; +import org.apache.iceberg.deletes.PositionDeleteIndex; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructLikeSet; +import org.apache.iceberg.util.StructProjection; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Reads a {@link org.apache.iceberg.DataFile} and returns records marked deleted by the given + * {@link DeleteFile}s. + * + *
<p>
This is mostly a copy of {@link org.apache.iceberg.data.DeleteFilter}, but flipping the logic + * to output deleted records instead of filtering them out. + */ +public abstract class DeleteReader { + private static final Logger LOG = LoggerFactory.getLogger(DeleteReader.class); + + private final String filePath; + private final List posDeletes; + private final List eqDeletes; + private final Schema requiredSchema; + private final Accessor posAccessor; + private volatile @Nullable DeleteLoader deleteLoader = null; + private @Nullable PositionDeleteIndex deleteRowPositions = null; + private @Nullable List> isInDeleteSets = null; + + protected DeleteReader( + String filePath, + List deletes, + Schema tableSchema, + Schema expectedSchema, + boolean needRowPosCol) { + this.filePath = filePath; + + ImmutableList.Builder posDeleteBuilder = ImmutableList.builder(); + ImmutableList.Builder eqDeleteBuilder = ImmutableList.builder(); + for (DeleteFile delete : deletes) { + switch (delete.content()) { + case POSITION_DELETES: + LOG.debug("Adding position delete file {} to reader", delete.location()); + posDeleteBuilder.add(delete); + break; + case EQUALITY_DELETES: + LOG.debug("Adding equality delete file {} to reader", delete.location()); + eqDeleteBuilder.add(delete); + break; + default: + throw new UnsupportedOperationException( + "Unknown delete file content: " + delete.content()); + } + } + + this.posDeletes = posDeleteBuilder.build(); + this.eqDeletes = eqDeleteBuilder.build(); + this.requiredSchema = + fileProjection(tableSchema, expectedSchema, posDeletes, eqDeletes, needRowPosCol); + this.posAccessor = requiredSchema.accessorForField(MetadataColumns.ROW_POSITION.fieldId()); + } + + protected DeleteReader( + String filePath, List deletes, Schema tableSchema, Schema requestedSchema) { + this(filePath, deletes, tableSchema, requestedSchema, true); + } + + public Schema requiredSchema() { + return requiredSchema; + } + + protected abstract StructLike asStructLike(T record); + + protected abstract InputFile getInputFile(String location); + + protected InputFile loadInputFile(DeleteFile deleteFile) { + return getInputFile(deleteFile.location()); + } + + protected long pos(T record) { + return (Long) posAccessor.get(asStructLike(record)); + } + + protected DeleteLoader newDeleteLoader() { + return new BaseDeleteLoader(this::loadInputFile); + } + + private DeleteLoader deleteLoader() { + if (deleteLoader == null) { + synchronized (this) { + if (deleteLoader == null) { + this.deleteLoader = newDeleteLoader(); + } + } + } + + return deleteLoader; + } + + public CloseableIterable read(CloseableIterable records) { + return applyEqDeletes(applyPosDeletes(records)); + } + + private List> applyEqDeletes() { + if (isInDeleteSets != null) { + return isInDeleteSets; + } + + isInDeleteSets = Lists.newArrayList(); + if (eqDeletes.isEmpty()) { + return isInDeleteSets; + } + + Multimap, DeleteFile> filesByDeleteIds = + Multimaps.newMultimap(Maps.newHashMap(), Lists::newArrayList); + for (DeleteFile delete : eqDeletes) { + filesByDeleteIds.put(Sets.newHashSet(delete.equalityFieldIds()), delete); + } + + for (Map.Entry, Collection> entry : + filesByDeleteIds.asMap().entrySet()) { + Set ids = entry.getKey(); + Iterable deletes = entry.getValue(); + + Schema deleteSchema = TypeUtil.select(requiredSchema, ids); + + // a projection to select and reorder fields of the file schema to match the delete rows + StructProjection projectRow = StructProjection.create(requiredSchema, deleteSchema); + + StructLikeSet deleteSet = 
deleteLoader().loadEqualityDeletes(deletes, deleteSchema); + Predicate isInDeleteSet = + record -> deleteSet.contains(projectRow.wrap(asStructLike(record))); + checkStateNotNull(isInDeleteSets).add(isInDeleteSet); + } + + return checkStateNotNull(isInDeleteSets); + } + + private CloseableIterable applyEqDeletes(CloseableIterable records) { + Predicate isEqDeleted = applyEqDeletes().stream().reduce(Predicate::or).orElse(t -> false); + + return CloseableIterable.filter(records, isEqDeleted); + } + + public PositionDeleteIndex deletedRowPositions() { + if (deleteRowPositions == null && !posDeletes.isEmpty()) { + deleteRowPositions = deleteLoader().loadPositionDeletes(posDeletes, filePath); + } + + return checkStateNotNull(deleteRowPositions); + } + + private CloseableIterable applyPosDeletes(CloseableIterable records) { + if (posDeletes.isEmpty()) { + return records; + } + + PositionDeleteIndex positionIndex = deletedRowPositions(); + Predicate isDeleted = record -> positionIndex.isDeleted(pos(record)); + return CloseableIterable.filter(records, isDeleted); + } + + private static Schema fileProjection( + Schema tableSchema, + Schema requestedSchema, + List posDeletes, + List eqDeletes, + boolean needRowPosCol) { + if (posDeletes.isEmpty() && eqDeletes.isEmpty()) { + return requestedSchema; + } + + Set requiredIds = Sets.newLinkedHashSet(); + if (needRowPosCol && !posDeletes.isEmpty()) { + requiredIds.add(MetadataColumns.ROW_POSITION.fieldId()); + } + + for (DeleteFile eqDelete : eqDeletes) { + requiredIds.addAll(eqDelete.equalityFieldIds()); + } + + Set missingIds = + Sets.newLinkedHashSet( + Sets.difference(requiredIds, TypeUtil.getProjectedIds(requestedSchema))); + + if (missingIds.isEmpty()) { + return requestedSchema; + } + + // TODO: support adding nested columns. this will currently fail when finding nested columns to + // add + List columns = Lists.newArrayList(requestedSchema.columns()); + for (int fieldId : missingIds) { + if (fieldId == MetadataColumns.ROW_POSITION.fieldId() + || fieldId == MetadataColumns.IS_DELETED.fieldId()) { + continue; // add _pos and _deleted at the end + } + + Types.NestedField field = tableSchema.asStruct().field(fieldId); + Preconditions.checkArgument(field != null, "Cannot find required field for ID %s", fieldId); + + columns.add(field); + } + + if (missingIds.contains(MetadataColumns.ROW_POSITION.fieldId())) { + columns.add(MetadataColumns.ROW_POSITION); + } + + if (missingIds.contains(MetadataColumns.IS_DELETED.fieldId())) { + columns.add(MetadataColumns.IS_DELETED); + } + + return new Schema(columns); + } +} diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java new file mode 100644 index 000000000000..cfe60631158c --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.cdc; + +import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.MIXED_CHANGES; +import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.UNIFORM_CHANGES; +import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_DELETES; +import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_INSERTS; +import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.UNIFORM_ROWS; +import static org.apache.beam.sdk.io.iceberg.cdc.ReconcileChanges.DELETES; +import static org.apache.beam.sdk.io.iceberg.cdc.ReconcileChanges.INSERTS; + +import java.util.List; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.io.iceberg.IcebergScanConfig; +import org.apache.beam.sdk.io.iceberg.IcebergUtils; +import org.apache.beam.sdk.io.iceberg.IncrementalScanSource; +import org.apache.beam.sdk.io.iceberg.SnapshotInfo; +import org.apache.beam.sdk.transforms.Flatten; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Redistribute; +import org.apache.beam.sdk.transforms.Reify; +import org.apache.beam.sdk.transforms.join.CoGroupByKey; +import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple; +import org.apache.beam.sdk.transforms.windowing.GlobalWindows; +import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; +import org.apache.beam.sdk.transforms.windowing.Window; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PBegin; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionList; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TimestampedValue; +import org.apache.beam.sdk.values.TupleTagList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; + +public class IncrementalChangelogSource extends IncrementalScanSource { + public IncrementalChangelogSource(IcebergScanConfig scanConfig) { + super(scanConfig); + } + + @Override + public PCollection expand(PBegin input) { + Table table = + scanConfig + .getCatalogConfig() + .catalog() + .loadTable(TableIdentifier.parse(scanConfig.getTableIdentifier())); + + PCollection>> snapshots = + MoreObjects.firstNonNull(scanConfig.getStreaming(), false) + ? unboundedSnapshots(input) + : boundedSnapshots(input, table); + + // scan each interval of snapshots and create groups of changelog tasks + PCollectionTuple changelogTasks = + snapshots + .apply(Redistribute.byKey()) + .apply( + "Create Changelog Tasks", + ParDo.of(new ChangelogScanner(scanConfig)) + .withOutputTags(UNIFORM_CHANGES, TupleTagList.of(MIXED_CHANGES))); + + // for changelog ordinal groups that have UNIFORM changes (i.e. all deletes, or all inserts), + // take the fast approach of just reading and emitting CDC records. 
+ PCollection fastPathCdcRows = + processUniformChanges( + changelogTasks.get(UNIFORM_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER)); + + // changelog ordinal groups that have MIXED changes (i.e. some deletes and some inserts) + // will need extra processing to identify any updates + PCollection slowPathCdcRows = + processMixedChanges( + changelogTasks.get(MIXED_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER)); + + // Merge UNIFORM and MIXED outputs + return PCollectionList.of(fastPathCdcRows).and(slowPathCdcRows).apply(Flatten.pCollections()); + } + + private PCollection processUniformChanges( + PCollection>> uniformChangelogs) { + return uniformChangelogs + .apply(Redistribute.arbitrarily()) + .apply( + "Read Uniform Changes", + ParDo.of(ReadFromChangelogs.of(scanConfig)) + .withOutputTags(UNIFORM_ROWS, TupleTagList.empty())) + .get(UNIFORM_ROWS) + .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema())); + } + + private PCollection processMixedChanges( + PCollection>> mixedChangelogs) { + PCollectionTuple mixedCdcKeyedRows = + mixedChangelogs + .apply(Redistribute.arbitrarily()) + .apply( + "Read Mixed Changes", + ParDo.of(ReadFromChangelogs.withKeyedOutput(scanConfig)) + .withOutputTags(KEYED_INSERTS, TupleTagList.of(KEYED_DELETES))); + + // prior to CoGBK, set a windowing strategy to maintain the earliest timestamp in the window + Window>> windowingStrategy = + Window.>>into(new GlobalWindows()) + .withTimestampCombiner(TimestampCombiner.EARLIEST); + + // preserve the element's timestamp by moving it into the value + KvCoder keyedOutputCoder = ReadFromChangelogs.keyedOutputCoder(scanConfig); + PCollection>> keyedInsertsWithTimestamps = + mixedCdcKeyedRows + .get(KEYED_INSERTS) + .setCoder(keyedOutputCoder) + .apply(Reify.timestampsInValue()) + .apply(windowingStrategy); + PCollection>> keyedDeletesWithTimestamps = + mixedCdcKeyedRows + .get(KEYED_DELETES) + .setCoder(keyedOutputCoder) + .apply(Reify.timestampsInValue()) + .apply(windowingStrategy); + + // CoGroup by record ID and emit any (DELETE + INSERT) pairs as updates: (UPDATE_BEFORE, + // UPDATE_AFTER) + return KeyedPCollectionTuple.of(INSERTS, keyedInsertsWithTimestamps) + .and(DELETES, keyedDeletesWithTimestamps) + .apply(CoGroupByKey.create()) + .apply("Reconcile Inserts and Deletes", ParDo.of(new ReconcileChanges())) + .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema())); + } +} diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java new file mode 100644 index 000000000000..7e4976747c4e --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.cdc; + +import static org.apache.beam.sdk.io.iceberg.IcebergUtils.icebergSchemaToBeamSchema; + +import java.io.IOException; +import java.util.List; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.io.iceberg.IcebergScanConfig; +import org.apache.beam.sdk.io.iceberg.IcebergUtils; +import org.apache.beam.sdk.io.iceberg.ReadUtils; +import org.apache.beam.sdk.io.iceberg.SerializableDeleteFile; +import org.apache.beam.sdk.io.range.OffsetRange; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.DeleteFilter; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructProjection; +import org.joda.time.Instant; + +@DoFn.BoundedPerElement +public class ReadFromChangelogs + extends DoFn>, OutT> { + public static final TupleTag UNIFORM_ROWS = new TupleTag<>(); + public static final TupleTag> KEYED_INSERTS = new TupleTag<>(); + public static final TupleTag> KEYED_DELETES = new TupleTag<>(); + + private final Counter numAddedRowsScanTasksCompleted = + Metrics.counter(ReadFromChangelogs.class, "numAddedRowsScanTasksCompleted"); + private final Counter numDeletedRowsScanTasksCompleted = + Metrics.counter(ReadFromChangelogs.class, "numDeletedRowsScanTasksCompleted"); + private final Counter numDeletedDataFileScanTasksCompleted = + Metrics.counter(ReadFromChangelogs.class, "numDeletedDataFileScanTasksCompleted"); + + private final IcebergScanConfig scanConfig; + private final boolean keyedOutput; + private transient StructProjection recordIdProjection; + private transient org.apache.iceberg.Schema recordIdSchema; + private final Schema beamRowSchema; + private final Schema rowIdWithOrdinalBeamSchema; + private static final String ORDINAL_FIELD = "__beam__changelog__ordinal__"; + + private ReadFromChangelogs(IcebergScanConfig scanConfig, boolean keyedOutput) { + this.scanConfig = scanConfig; + this.keyedOutput = keyedOutput; + + this.beamRowSchema = icebergSchemaToBeamSchema(scanConfig.getProjectedSchema()); + org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); + this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames()); + this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema); + + Schema rowIdBeamSchema = icebergSchemaToBeamSchema(recordIdSchema); + List fields = + ImmutableList.builder() + .add(Schema.Field.of(ORDINAL_FIELD, Schema.FieldType.INT32)) + .addAll(rowIdBeamSchema.getFields()) + .build(); + this.rowIdWithOrdinalBeamSchema = new Schema(fields); + } + + static ReadFromChangelogs of(IcebergScanConfig scanConfig) { + return new ReadFromChangelogs<>(scanConfig, false); + } + + static ReadFromChangelogs> 
withKeyedOutput(IcebergScanConfig scanConfig) { + return new ReadFromChangelogs<>(scanConfig, true); + } + + /** + * Determines the keyed output coder, which depends on the requested projected schema and the + * schema's identifier fields. + */ + static KvCoder keyedOutputCoder(IcebergScanConfig scanConfig) { + org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); + org.apache.iceberg.Schema recordIdSchema = + recordSchema.select(recordSchema.identifierFieldNames()); + Schema rowIdBeamSchema = icebergSchemaToBeamSchema(recordIdSchema); + List fields = + ImmutableList.builder() + .add(Schema.Field.of(ORDINAL_FIELD, Schema.FieldType.INT32)) + .addAll(rowIdBeamSchema.getFields()) + .build(); + Schema rowIdWithOrdinalBeamSchema = new Schema(fields); + return KvCoder.of( + SchemaCoder.of(rowIdWithOrdinalBeamSchema), + SchemaCoder.of(icebergSchemaToBeamSchema(recordSchema))); + } + + @Setup + public void setup() { + // StructProjection is not serializable, so we need to recompute it when the DoFn gets + // deserialized + org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); + this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames()); + this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema); + } + + @ProcessElement + public void process( + @Element KV> element, + RestrictionTracker tracker, + MultiOutputReceiver out) + throws IOException { + // TODO: use TableCache + Table table = scanConfig.getTable(); + table.refresh(); + + List tasks = element.getValue(); + + for (long l = tracker.currentRestriction().getFrom(); + l < tracker.currentRestriction().getTo(); + l++) { + if (!tracker.tryClaim(l)) { + return; + } + + SerializableChangelogTask task = tasks.get((int) l); + switch (task.getType()) { + case ADDED_ROWS: + processAddedRowsTask(task, table, out); + break; + case DELETED_ROWS: + processDeletedRowsTask(task, table, out); + break; + case DELETED_FILE: + processDeletedFileTask(task, table, out); + break; + } + } + } + + private void processAddedRowsTask( + SerializableChangelogTask task, Table table, MultiOutputReceiver outputReceiver) + throws IOException { + try (CloseableIterable fullIterable = ReadUtils.createReader(task, table, scanConfig)) { + DeleteFilter deleteFilter = + ReadUtils.genericDeleteFilter( + table, scanConfig, task.getDataFile().getPath(), task.getExistingDeletes()); + CloseableIterable filtered = deleteFilter.filter(fullIterable); + + for (Record rec : filtered) { + outputRecord( + rec, outputReceiver, task.getOrdinal(), task.getTimestampMillis(), KEYED_INSERTS); + } + } + numAddedRowsScanTasksCompleted.inc(); + } + + private void processDeletedRowsTask( + SerializableChangelogTask task, Table table, MultiOutputReceiver outputReceiver) + throws IOException { + DeleteFilter existingDeletesFilter = + ReadUtils.genericDeleteFilter( + table, scanConfig, task.getDataFile().getPath(), task.getExistingDeletes()); + DeleteReader newDeletesReader = + ReadUtils.genericDeleteReader( + table, scanConfig, task.getDataFile().getPath(), task.getAddedDeletes()); + + try (CloseableIterable allRecords = ReadUtils.createReader(task, table, scanConfig)) { + CloseableIterable liveRecords = existingDeletesFilter.filter(allRecords); + CloseableIterable newlyDeletedRecords = newDeletesReader.read(liveRecords); + + for (Record rec : newlyDeletedRecords) { + // TODO: output with DELETE kind + outputRecord( + rec, outputReceiver, task.getOrdinal(), task.getTimestampMillis(), KEYED_DELETES); + } + } + 
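    // At this point, every record that was still live under the pre-existing deletes but is
    // removed by this snapshot's newly added delete files has been emitted as a delete
    // (to KEYED_DELETES in keyed mode, or to UNIFORM_ROWS otherwise).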
numDeletedRowsScanTasksCompleted.inc(); + } + + private void processDeletedFileTask( + SerializableChangelogTask task, Table table, MultiOutputReceiver outputReceiver) + throws IOException { + try (CloseableIterable fullIterable = ReadUtils.createReader(task, table, scanConfig)) { + DeleteFilter deleteFilter = + ReadUtils.genericDeleteFilter( + table, scanConfig, task.getDataFile().getPath(), task.getExistingDeletes()); + CloseableIterable filtered = deleteFilter.filter(fullIterable); + for (Record rec : filtered) { + // TODO: output with DELETE kind + outputRecord( + rec, outputReceiver, task.getOrdinal(), task.getTimestampMillis(), KEYED_DELETES); + } + } + numDeletedDataFileScanTasksCompleted.inc(); + } + + private void outputRecord( + Record rec, + MultiOutputReceiver outputReceiver, + int ordinal, + long timestampMillis, + TupleTag> keyedTag) { + Row row = IcebergUtils.icebergRecordToBeamRow(beamRowSchema, rec); + Instant timestamp = Instant.ofEpochMilli(timestampMillis); + if (keyedOutput) { // slow path + StructProjection recId = recordIdProjection.wrap(rec); + Row id = structToBeamRow(ordinal, recId, recordIdSchema, rowIdWithOrdinalBeamSchema); + outputReceiver.get(keyedTag).outputWithTimestamp(KV.of(id, row), timestamp); + } else { // fast path + System.out.printf("[UNIFORM] -- Output(%s, %s)\n%s%n", ordinal, timestamp, row); + outputReceiver.get(UNIFORM_ROWS).outputWithTimestamp(row, timestamp); + } + } + + public static Row structToBeamRow( + int ordinal, StructLike struct, org.apache.iceberg.Schema schema, Schema beamSchema) { + ImmutableMap.Builder values = ImmutableMap.builder(); + List columns = schema.columns(); + for (Types.NestedField column : columns) { + String name = column.name(); + Object value = schema.accessorForField(column.fieldId()).get(struct); + values.put(name, value); + } + // include ordinal as part of the row ID to ensure we are comparing rows within the same + // operation + values.put(ORDINAL_FIELD, ordinal); + return Row.withSchema(beamSchema).withFieldValues(values.build()).build(); + } + + @GetSize + public double getSize( + @Element KV> element, + @Restriction OffsetRange restriction) { + // TODO(ahmedabu98): this is just the compressed byte size. find a way to make a better estimate + long size = 0; + + for (long l = restriction.getFrom(); l < restriction.getTo(); l++) { + SerializableChangelogTask task = element.getValue().get((int) l); + size += task.getDataFile().getFileSizeInBytes(); + size += + task.getAddedDeletes().stream() + .mapToLong(SerializableDeleteFile::getFileSizeInBytes) + .sum(); + size += + task.getExistingDeletes().stream() + .mapToLong(SerializableDeleteFile::getFileSizeInBytes) + .sum(); + } + + return size; + } + + @GetInitialRestriction + public OffsetRange getInitialRange( + @Element KV> element) { + return new OffsetRange(0, element.getValue().size()); + } +} diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java new file mode 100644 index 000000000000..0419538a23f5 --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.cdc; + +import java.util.Iterator; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.join.CoGbkResult; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TimestampedValue; +import org.apache.beam.sdk.values.TupleTag; +import org.joda.time.Instant; + +public class ReconcileChanges extends DoFn, Row> { + public static final TupleTag> DELETES = new TupleTag<>() {}; + public static final TupleTag> INSERTS = new TupleTag<>() {}; + + @DoFn.ProcessElement + public void processElement( + @Element KV element, + @Timestamp Instant timestamp, + OutputReceiver out) { + CoGbkResult result = element.getValue(); + + // iterables are lazy-loaded from the shuffle service + Iterable> deletes = result.getAll(DELETES); + Iterable> inserts = result.getAll(INSERTS); + + boolean hasDeletes = deletes.iterator().hasNext(); + boolean hasInserts = inserts.iterator().hasNext(); + + if (hasInserts && hasDeletes) { + // UPDATE: row ID exists in both streams + // emit all deletes as 'UPDATE_BEFORE', and all inserts as 'UPDATE_AFTER' + // emit extra inserts as 'UPDATE_AFTER' + // ignore extra deletes (TODO: double check this decision) + Iterator> deletesIterator = deletes.iterator(); + Iterator> insertsIterator = inserts.iterator(); + while (deletesIterator.hasNext() && insertsIterator.hasNext()) { + // TODO: output as UPDATE_BEFORE kind + TimestampedValue updateBefore = deletesIterator.next(); + out.outputWithTimestamp(updateBefore.getValue(), updateBefore.getTimestamp()); + System.out.printf("[MIXED] -- UpdateBefore\n%s\n", updateBefore); + + // TODO: output as UPDATE_AFTER kind + TimestampedValue updateAfter = insertsIterator.next(); + out.outputWithTimestamp(updateAfter.getValue(), updateAfter.getTimestamp()); + System.out.printf("[MIXED] -- UpdateAfter\n%s\n", updateAfter); + } + while (insertsIterator.hasNext()) { + // TODO: output as UPDATE_AFTER kind + TimestampedValue insert = insertsIterator.next(); + out.outputWithTimestamp(insert.getValue(), insert.getTimestamp()); + System.out.printf("[MIXED] -- Added(extra)\n%s\n", insert); + } + } else if (hasInserts) { + // INSERT only + for (TimestampedValue rec : inserts) { + System.out.printf("[MIXED] -- Added\n%s\n", rec); + out.outputWithTimestamp(rec.getValue(), rec.getTimestamp()); + } + } else if (hasDeletes) { + // DELETE only + for (TimestampedValue rec : deletes) { + // TODO: output as DELETE kind + System.out.printf("[MIXED] -- Deleted\n%s\n", rec); + out.outputWithTimestamp(rec.getValue(), rec.getTimestamp()); + } + } + } +} diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java new file mode 100644 index 000000000000..1f0aae99a1ad --- /dev/null +++ 
b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.cdc; + +import static com.google.common.base.Preconditions.checkState; + +import com.google.auto.value.AutoValue; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.beam.sdk.io.iceberg.SerializableDataFile; +import org.apache.beam.sdk.io.iceberg.SerializableDeleteFile; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.NoSuchSchemaException; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.schemas.SchemaRegistry; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaIgnore; +import org.apache.iceberg.AddedRowsScanTask; +import org.apache.iceberg.ChangelogOperation; +import org.apache.iceberg.ChangelogScanTask; +import org.apache.iceberg.ContentScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.DeletedDataFileScanTask; +import org.apache.iceberg.DeletedRowsScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.ExpressionParser; + +@DefaultSchema(AutoValueSchema.class) +@AutoValue +public abstract class SerializableChangelogTask { + public enum Type { + ADDED_ROWS, + DELETED_ROWS, + DELETED_FILE + } + + public static SchemaCoder coder() { + try { + return SchemaRegistry.createDefault().getSchemaCoder(SerializableChangelogTask.class); + } catch (NoSuchSchemaException e) { + throw new RuntimeException(e); + } + } + + public static SerializableChangelogTask.Builder builder() { + return new AutoValue_SerializableChangelogTask.Builder() + .setExistingDeletes(Collections.emptyList()) + .setAddedDeletes(Collections.emptyList()); + } + + public abstract Type getType(); + + public abstract SerializableDataFile getDataFile(); + + public abstract List getExistingDeletes(); + + public abstract List getAddedDeletes(); + + public abstract int getSpecId(); + + public abstract ChangelogOperation getOperation(); + + public abstract int getOrdinal(); + + public abstract long getCommitSnapshotId(); + + public abstract long getStart(); + + public abstract long getLength(); + + public abstract String getJsonExpression(); + + public abstract long getTimestampMillis(); + + @SchemaIgnore + public Expression getExpression(Schema schema) { + return ExpressionParser.fromJson(getJsonExpression(), schema); + } + + @AutoValue.Builder + public abstract static class 
Builder { + abstract Builder setType(Type type); + + abstract Builder setDataFile(SerializableDataFile dataFile); + + @SchemaIgnore + public Builder setDataFile(DataFile df, String partitionPath) { + return setDataFile(SerializableDataFile.from(df, partitionPath)); + } + + abstract Builder setExistingDeletes(List existingDeletes); + + abstract Builder setAddedDeletes(List addedDeletes); + + abstract Builder setSpecId(int specId); + + abstract Builder setOperation(ChangelogOperation operation); + + abstract Builder setOrdinal(int ordinal); + + abstract Builder setCommitSnapshotId(long commitSnapshotId); + + abstract Builder setStart(long start); + + abstract Builder setLength(long length); + + abstract Builder setJsonExpression(String expression); + + abstract Builder setTimestampMillis(long timestampMillis); + + abstract SerializableChangelogTask build(); + } + + @SuppressWarnings("nullness") + public static SerializableChangelogTask from(ChangelogScanTask task, long timestampMillis) { + checkState( + task instanceof ContentScanTask, "Expected ChangelogScanTask to also be a ContentScanTask"); + ContentScanTask contentScanTask = (ContentScanTask) task; + PartitionSpec spec = contentScanTask.spec(); + SerializableChangelogTask.Builder builder = + SerializableChangelogTask.builder() + .setOperation(task.operation()) + .setOrdinal(task.changeOrdinal()) + .setCommitSnapshotId(task.commitSnapshotId()) + .setDataFile(contentScanTask.file(), spec.partitionToPath(contentScanTask.partition())) + .setSpecId(spec.specId()) + .setStart(contentScanTask.start()) + .setLength(contentScanTask.length()) + .setJsonExpression(ExpressionParser.toJson(contentScanTask.residual())) + .setTimestampMillis(timestampMillis); + + if (task instanceof AddedRowsScanTask) { + AddedRowsScanTask addedRowsTask = (AddedRowsScanTask) task; + builder = + builder + .setType(Type.ADDED_ROWS) + .setAddedDeletes(toSerializableDeletes(addedRowsTask.deletes(), spec)); + } else if (task instanceof DeletedRowsScanTask) { + DeletedRowsScanTask deletedRowsTask = (DeletedRowsScanTask) task; + builder = + builder + .setType(Type.DELETED_ROWS) + .setAddedDeletes(toSerializableDeletes(deletedRowsTask.addedDeletes(), spec)) + .setExistingDeletes(toSerializableDeletes(deletedRowsTask.existingDeletes(), spec)); + } else if (task instanceof DeletedDataFileScanTask) { + DeletedDataFileScanTask deletedFileTask = (DeletedDataFileScanTask) task; + builder = + builder + .setType(Type.DELETED_FILE) + .setExistingDeletes(toSerializableDeletes(deletedFileTask.existingDeletes(), spec)); + } else { + throw new IllegalStateException("Unknown ChangelogScanTask type: " + task.getClass()); + } + return builder.build(); + } + + private static List toSerializableDeletes( + List dfs, PartitionSpec spec) { + return dfs.stream() + .map(df -> SerializableDeleteFile.from(df, spec.partitionToPath(df.partition()))) + .collect(Collectors.toList()); + } + + public static Comparator comparator() { + return (task1, task2) -> { + int ordinalCompare = Integer.compare(task1.getOrdinal(), task2.getOrdinal()); + if (ordinalCompare != 0) { + return ordinalCompare; + } + + int op1Weight = getOperationWeight(task1.getOperation()); + int op2Weight = getOperationWeight(task2.getOperation()); + + return Integer.compare(op1Weight, op2Weight); + }; + } + + private static int getOperationWeight(ChangelogOperation op) { + switch (op) { + case DELETE: + case UPDATE_BEFORE: + return 0; + case INSERT: + case UPDATE_AFTER: + return 1; + default: + throw new 
UnsupportedOperationException("Unknown ChangelogOperation: " + op); + } + } +} From eb2382818c575ba68ebde030ba4d14bf55ba15f3 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Fri, 26 Dec 2025 13:03:35 -0500 Subject: [PATCH 2/9] comments --- .../beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java | 6 ++++-- .../beam/sdk/io/iceberg/cdc/ReconcileChanges.java | 6 +++--- .../io/iceberg/cdc/SerializableChangelogTask.java | 13 +++++++++++++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java index 7e4976747c4e..59d465b42aa2 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java @@ -225,6 +225,7 @@ private void outputRecord( Instant timestamp = Instant.ofEpochMilli(timestampMillis); if (keyedOutput) { // slow path StructProjection recId = recordIdProjection.wrap(rec); + // Create a Row ID consisting of record ID columns and the changelog task's ordinal # Row id = structToBeamRow(ordinal, recId, recordIdSchema, rowIdWithOrdinalBeamSchema); outputReceiver.get(keyedTag).outputWithTimestamp(KV.of(id, row), timestamp); } else { // fast path @@ -242,8 +243,9 @@ public static Row structToBeamRow( Object value = schema.accessorForField(column.fieldId()).get(struct); values.put(name, value); } - // include ordinal as part of the row ID to ensure we are comparing rows within the same - // operation + // Include ordinal as part of the row ID. + // This is essential to ensure that the downstream ReconcileChanges compares rows + // within the same operation. 
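    // For example, with identifier fields (id, region), the emitted key row has the shape
    // (__beam__changelog__ordinal__, id, region), so a delete and an insert of the same record
    // can only be paired downstream by ReconcileChanges if they belong to the same ordinal.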
values.put(ORDINAL_FIELD, ordinal); return Row.withSchema(beamSchema).withFieldValues(values.build()).build(); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java index 0419538a23f5..eebe18f44a8e 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java @@ -46,9 +46,9 @@ public void processElement( if (hasInserts && hasDeletes) { // UPDATE: row ID exists in both streams - // emit all deletes as 'UPDATE_BEFORE', and all inserts as 'UPDATE_AFTER' - // emit extra inserts as 'UPDATE_AFTER' - // ignore extra deletes (TODO: double check this decision) + // - emit all deletes as 'UPDATE_BEFORE', and all inserts as 'UPDATE_AFTER' + // - emit extra inserts as 'UPDATE_AFTER' + // - ignore extra deletes (TODO: double check if this is a good decision) Iterator> deletesIterator = deletes.iterator(); Iterator> insertsIterator = inserts.iterator(); while (deletesIterator.hasNext() && insertsIterator.hasNext()) { diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java index 1f0aae99a1ad..6d5920ae5a7e 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java @@ -31,6 +31,7 @@ import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.schemas.SchemaRegistry; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldNumber; import org.apache.beam.sdk.schemas.annotations.SchemaIgnore; import org.apache.iceberg.AddedRowsScanTask; import org.apache.iceberg.ChangelogOperation; @@ -68,28 +69,40 @@ public static SerializableChangelogTask.Builder builder() { .setAddedDeletes(Collections.emptyList()); } + @SchemaFieldNumber("0") public abstract Type getType(); + @SchemaFieldNumber("1") public abstract SerializableDataFile getDataFile(); + @SchemaFieldNumber("2") public abstract List getExistingDeletes(); + @SchemaFieldNumber("3") public abstract List getAddedDeletes(); + @SchemaFieldNumber("4") public abstract int getSpecId(); + @SchemaFieldNumber("5") public abstract ChangelogOperation getOperation(); + @SchemaFieldNumber("6") public abstract int getOrdinal(); + @SchemaFieldNumber("7") public abstract long getCommitSnapshotId(); + @SchemaFieldNumber("8") public abstract long getStart(); + @SchemaFieldNumber("9") public abstract long getLength(); + @SchemaFieldNumber("10") public abstract String getJsonExpression(); + @SchemaFieldNumber("11") public abstract long getTimestampMillis(); @SchemaIgnore From 4e81f78e1ead25bd1d9250c4b21c194ef5b909ae Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Sun, 28 Dec 2025 03:41:20 -0500 Subject: [PATCH 3/9] naming changes; key on snapshot ID instead of ordinal ID --- .../sdk/io/iceberg/cdc/ChangelogScanner.java | 27 +++---- .../cdc/IncrementalChangelogSource.java | 57 ++++++++------- .../io/iceberg/cdc/ReadFromChangelogs.java | 72 +++++++++++-------- .../sdk/io/iceberg/cdc/ReconcileChanges.java | 1 + 4 files changed, 92 insertions(+), 65 deletions(-) diff --git 
a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java index 5e9583ded83d..d9c59313d58a 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java @@ -61,9 +61,9 @@ public class ChangelogScanner private static final Counter numDeletedDataFileScanTasks = Metrics.counter(ChangelogScanner.class, "numDeletedDataFileScanTasks"); public static final TupleTag>> - UNIFORM_CHANGES = new TupleTag<>(); + UNIDIRECTIONAL_CHANGES = new TupleTag<>(); public static final TupleTag>> - MIXED_CHANGES = new TupleTag<>(); + BIDIRECTIONAL_CHANGES = new TupleTag<>(); public static final KvCoder> OUTPUT_CODER = KvCoder.of(ChangelogDescriptor.coder(), ListCoder.of(SerializableChangelogTask.coder())); private final IcebergScanConfig scanConfig; @@ -117,7 +117,7 @@ private void createAndOutputReadTasks( Map cachedSnapshotTimestamps = new HashMap<>(); // Maintain the same scan task groupings produced by Iceberg's binpacking, for // better work load distribution among readers. - // Also allows the user to control by setting a `read.split.target-size`: + // This allows the user to control load per worker by tuning `read.split.target-size`: // https://iceberg.apache.org/docs/latest/configuration/#read-properties Map>> changelogScanTaskGroups = new HashMap<>(); @@ -126,7 +126,7 @@ private void createAndOutputReadTasks( try (CloseableIterable> scanTaskGroups = scan.planTasks()) { for (ScanTaskGroup scanTaskGroup : scanTaskGroups) { - Map> ordinalGroups = new HashMap<>(); + Map> ordinalTaskGroup = new HashMap<>(); for (ChangelogScanTask changelogScanTask : scanTaskGroup.tasks()) { long snapshotId = changelogScanTask.commitSnapshotId(); @@ -137,7 +137,7 @@ private void createAndOutputReadTasks( SerializableChangelogTask task = SerializableChangelogTask.from(changelogScanTask, timestampMillis); - ordinalGroups.computeIfAbsent(ordinal, (unused) -> new ArrayList<>()).add(task); + ordinalTaskGroup.computeIfAbsent(ordinal, (o) -> new ArrayList<>()).add(task); changeTypesPerOrdinal .computeIfAbsent(ordinal, (o) -> new HashSet<>()) @@ -158,7 +158,7 @@ private void createAndOutputReadTasks( } for (Map.Entry> ordinalGroup : - ordinalGroups.entrySet()) { + ordinalTaskGroup.entrySet()) { changelogScanTaskGroups .computeIfAbsent(ordinalGroup.getKey(), (unused) -> new ArrayList<>()) .add(ordinalGroup.getValue()); @@ -198,17 +198,20 @@ private void createAndOutputReadTasks( KV.of(descriptor, subgroup); // Determine where each ordinal's tasks will go, based on the type of changes: - // 1. If an ordinal's changes are uniform (i.e. all inserts or all deletes), they should be + // 1. If an ordinal's changes are unidirectional (i.e. only inserts or only deletes), they + // should be // processed directly in the fast path. - // 2. If an ordinal's changes are mixed (i.e. some inserts and some deletes), they will need + // 2. If an ordinal's changes are bidirectional (i.e. both inserts and deletes), they will + // need // more careful processing to determine if any updates have occurred. 
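      // The decision uses the set of task types collected for this ordinal while grouping above:
      // ADDED_ROWS appearing together with any delete-producing task type means the ordinal holds
      // both inserts and deletes, so its task groups are sent through the keyed (shuffle) path.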
Set changeTypes = checkStateNotNull(changeTypesPerOrdinal.get(ordinal)); TupleTag>> outputTag; - if (changeTypes.contains(ADDED_ROWS) && changeTypes.size() > 1) { // added and deleted rows - outputTag = MIXED_CHANGES; - } else { // all added or all deleted rows - outputTag = UNIFORM_CHANGES; + if (changeTypes.contains(ADDED_ROWS) + && changeTypes.size() > 1) { // both added and deleted rows + outputTag = BIDIRECTIONAL_CHANGES; + } else { // only added or only deleted rows + outputTag = UNIDIRECTIONAL_CHANGES; } multiOutputReceiver.get(outputTag).outputWithTimestamp(output, timestamp); diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java index cfe60631158c..f58893adb946 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java @@ -17,8 +17,8 @@ */ package org.apache.beam.sdk.io.iceberg.cdc; -import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.MIXED_CHANGES; -import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.UNIFORM_CHANGES; +import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.BIDIRECTIONAL_CHANGES; +import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.UNIDIRECTIONAL_CHANGES; import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_DELETES; import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_INSERTS; import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.UNIFORM_ROWS; @@ -77,25 +77,28 @@ public PCollection expand(PBegin input) { .apply( "Create Changelog Tasks", ParDo.of(new ChangelogScanner(scanConfig)) - .withOutputTags(UNIFORM_CHANGES, TupleTagList.of(MIXED_CHANGES))); + .withOutputTags( + UNIDIRECTIONAL_CHANGES, TupleTagList.of(BIDIRECTIONAL_CHANGES))); // for changelog ordinal groups that have UNIFORM changes (i.e. all deletes, or all inserts), // take the fast approach of just reading and emitting CDC records. - PCollection fastPathCdcRows = - processUniformChanges( - changelogTasks.get(UNIFORM_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER)); + PCollection uniDirectionalCdcRows = + processUniDirectionalChanges( + changelogTasks.get(UNIDIRECTIONAL_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER)); - // changelog ordinal groups that have MIXED changes (i.e. some deletes and some inserts) - // will need extra processing to identify any updates - PCollection slowPathCdcRows = - processMixedChanges( - changelogTasks.get(MIXED_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER)); + // changelog ordinal groups that have BIDIRECTIONAL changes (i.e. 
both deletes and inserts) + // will need extra processing (including a shuffle) to identify any updates + PCollection largeBiDirectionalCdcRows = + processLargeBiDirectionalChanges( + changelogTasks.get(BIDIRECTIONAL_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER)); - // Merge UNIFORM and MIXED outputs - return PCollectionList.of(fastPathCdcRows).and(slowPathCdcRows).apply(Flatten.pCollections()); + // Merge UNIDIRECTIONAL and BIDIRECTIONAL outputs + return PCollectionList.of(uniDirectionalCdcRows) + .and(largeBiDirectionalCdcRows) + .apply(Flatten.pCollections()); } - private PCollection processUniformChanges( + private PCollection processUniDirectionalChanges( PCollection>> uniformChangelogs) { return uniformChangelogs .apply(Redistribute.arbitrarily()) @@ -107,41 +110,47 @@ private PCollection processUniformChanges( .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema())); } - private PCollection processMixedChanges( - PCollection>> mixedChangelogs) { - PCollectionTuple mixedCdcKeyedRows = - mixedChangelogs + private PCollection processLargeBiDirectionalChanges( + PCollection>> + biDirectionalChangelogs) { + PCollectionTuple biDirectionalKeyedRows = + biDirectionalChangelogs .apply(Redistribute.arbitrarily()) .apply( - "Read Mixed Changes", + "Read Large BiDirectional Changes", ParDo.of(ReadFromChangelogs.withKeyedOutput(scanConfig)) .withOutputTags(KEYED_INSERTS, TupleTagList.of(KEYED_DELETES))); // prior to CoGBK, set a windowing strategy to maintain the earliest timestamp in the window + // this allows us to emit records downstream that may have larger reified timestamps Window>> windowingStrategy = Window.>>into(new GlobalWindows()) .withTimestampCombiner(TimestampCombiner.EARLIEST); // preserve the element's timestamp by moving it into the value + // in the normal case, this will be a no-op because all CDC rows in an ordinal have the same + // commit timestamp. 
+ // but this will matter if we add custom watermarking, where record timestamps are + // derived from a specified column KvCoder keyedOutputCoder = ReadFromChangelogs.keyedOutputCoder(scanConfig); PCollection>> keyedInsertsWithTimestamps = - mixedCdcKeyedRows + biDirectionalKeyedRows .get(KEYED_INSERTS) .setCoder(keyedOutputCoder) - .apply(Reify.timestampsInValue()) + .apply("Reify INSERT Timestamps", Reify.timestampsInValue()) .apply(windowingStrategy); PCollection>> keyedDeletesWithTimestamps = - mixedCdcKeyedRows + biDirectionalKeyedRows .get(KEYED_DELETES) .setCoder(keyedOutputCoder) - .apply(Reify.timestampsInValue()) + .apply("Reify DELETE Timestamps", Reify.timestampsInValue()) .apply(windowingStrategy); // CoGroup by record ID and emit any (DELETE + INSERT) pairs as updates: (UPDATE_BEFORE, // UPDATE_AFTER) return KeyedPCollectionTuple.of(INSERTS, keyedInsertsWithTimestamps) .and(DELETES, keyedDeletesWithTimestamps) - .apply(CoGroupByKey.create()) + .apply("CoGroupBy Row ID", CoGroupByKey.create()) .apply("Reconcile Inserts and Deletes", ParDo.of(new ReconcileChanges())) .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema())); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java index 59d465b42aa2..4e202c7155f7 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java @@ -66,8 +66,8 @@ public class ReadFromChangelogs private transient StructProjection recordIdProjection; private transient org.apache.iceberg.Schema recordIdSchema; private final Schema beamRowSchema; - private final Schema rowIdWithOrdinalBeamSchema; - private static final String ORDINAL_FIELD = "__beam__changelog__ordinal__"; + private final Schema rowAndSnapshotIDBeamSchema; + private static final String SNAPSHOT_FIELD = "__beam__changelog__snapshot__id__"; private ReadFromChangelogs(IcebergScanConfig scanConfig, boolean keyedOutput) { this.scanConfig = scanConfig; @@ -78,13 +78,7 @@ private ReadFromChangelogs(IcebergScanConfig scanConfig, boolean keyedOutput) { this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames()); this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema); - Schema rowIdBeamSchema = icebergSchemaToBeamSchema(recordIdSchema); - List fields = - ImmutableList.builder() - .add(Schema.Field.of(ORDINAL_FIELD, Schema.FieldType.INT32)) - .addAll(rowIdBeamSchema.getFields()) - .build(); - this.rowIdWithOrdinalBeamSchema = new Schema(fields); + this.rowAndSnapshotIDBeamSchema = rowAndSnapshotIDBeamSchema(scanConfig); } static ReadFromChangelogs of(IcebergScanConfig scanConfig) { @@ -100,19 +94,24 @@ static ReadFromChangelogs> withKeyedOutput(IcebergScanConfig scanCo * schema's identifier fields. 
*/ static KvCoder keyedOutputCoder(IcebergScanConfig scanConfig) { + org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); + Schema rowAndSnapshotIDBeamSchema = rowAndSnapshotIDBeamSchema(scanConfig); + return KvCoder.of( + SchemaCoder.of(rowAndSnapshotIDBeamSchema), + SchemaCoder.of(icebergSchemaToBeamSchema(recordSchema))); + } + + private static Schema rowAndSnapshotIDBeamSchema(IcebergScanConfig scanConfig) { org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); org.apache.iceberg.Schema recordIdSchema = - recordSchema.select(recordSchema.identifierFieldNames()); + recordSchema.select(recordSchema.identifierFieldNames()); Schema rowIdBeamSchema = icebergSchemaToBeamSchema(recordIdSchema); List fields = - ImmutableList.builder() - .add(Schema.Field.of(ORDINAL_FIELD, Schema.FieldType.INT32)) - .addAll(rowIdBeamSchema.getFields()) - .build(); - Schema rowIdWithOrdinalBeamSchema = new Schema(fields); - return KvCoder.of( - SchemaCoder.of(rowIdWithOrdinalBeamSchema), - SchemaCoder.of(icebergSchemaToBeamSchema(recordSchema))); + ImmutableList.builder() + .add(Schema.Field.of(SNAPSHOT_FIELD, Schema.FieldType.INT64)) + .addAll(rowIdBeamSchema.getFields()) + .build(); + return new Schema(fields); } @Setup @@ -164,12 +163,16 @@ private void processAddedRowsTask( try (CloseableIterable fullIterable = ReadUtils.createReader(task, table, scanConfig)) { DeleteFilter deleteFilter = ReadUtils.genericDeleteFilter( - table, scanConfig, task.getDataFile().getPath(), task.getExistingDeletes()); + table, scanConfig, task.getDataFile().getPath(), task.getAddedDeletes()); CloseableIterable filtered = deleteFilter.filter(fullIterable); for (Record rec : filtered) { outputRecord( - rec, outputReceiver, task.getOrdinal(), task.getTimestampMillis(), KEYED_INSERTS); + rec, + outputReceiver, + task.getCommitSnapshotId(), + task.getTimestampMillis(), + KEYED_INSERTS); } } numAddedRowsScanTasksCompleted.inc(); @@ -192,7 +195,11 @@ private void processDeletedRowsTask( for (Record rec : newlyDeletedRecords) { // TODO: output with DELETE kind outputRecord( - rec, outputReceiver, task.getOrdinal(), task.getTimestampMillis(), KEYED_DELETES); + rec, + outputReceiver, + task.getCommitSnapshotId(), + task.getTimestampMillis(), + KEYED_DELETES); } } numDeletedRowsScanTasksCompleted.inc(); @@ -209,7 +216,11 @@ private void processDeletedFileTask( for (Record rec : filtered) { // TODO: output with DELETE kind outputRecord( - rec, outputReceiver, task.getOrdinal(), task.getTimestampMillis(), KEYED_DELETES); + rec, + outputReceiver, + task.getCommitSnapshotId(), + task.getTimestampMillis(), + KEYED_DELETES); } } numDeletedDataFileScanTasksCompleted.inc(); @@ -218,24 +229,27 @@ private void processDeletedFileTask( private void outputRecord( Record rec, MultiOutputReceiver outputReceiver, - int ordinal, + long snapshotId, long timestampMillis, TupleTag> keyedTag) { Row row = IcebergUtils.icebergRecordToBeamRow(beamRowSchema, rec); Instant timestamp = Instant.ofEpochMilli(timestampMillis); if (keyedOutput) { // slow path StructProjection recId = recordIdProjection.wrap(rec); - // Create a Row ID consisting of record ID columns and the changelog task's ordinal # - Row id = structToBeamRow(ordinal, recId, recordIdSchema, rowIdWithOrdinalBeamSchema); + // Create a Row ID consisting of: + // 1. the task's commit snapshot ID + // 2. 
the record ID column values + // This is needed to sufficiently distinguish a record change + Row id = structToBeamRow(snapshotId, recId, recordIdSchema, rowAndSnapshotIDBeamSchema); outputReceiver.get(keyedTag).outputWithTimestamp(KV.of(id, row), timestamp); } else { // fast path - System.out.printf("[UNIFORM] -- Output(%s, %s)\n%s%n", ordinal, timestamp, row); + System.out.printf("[UNIFORM] -- Output(%s, %s)\n%s%n", snapshotId, timestamp, row); outputReceiver.get(UNIFORM_ROWS).outputWithTimestamp(row, timestamp); } } public static Row structToBeamRow( - int ordinal, StructLike struct, org.apache.iceberg.Schema schema, Schema beamSchema) { + long snapshotId, StructLike struct, org.apache.iceberg.Schema schema, Schema beamSchema) { ImmutableMap.Builder values = ImmutableMap.builder(); List columns = schema.columns(); for (Types.NestedField column : columns) { @@ -243,10 +257,10 @@ public static Row structToBeamRow( Object value = schema.accessorForField(column.fieldId()).get(struct); values.put(name, value); } - // Include ordinal as part of the row ID. + // Include snapshot ID as part of the row ID. // This is essential to ensure that the downstream ReconcileChanges compares rows // within the same operation. - values.put(ORDINAL_FIELD, ordinal); + values.put(SNAPSHOT_FIELD, snapshotId); return Row.withSchema(beamSchema).withFieldValues(values.build()).build(); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java index eebe18f44a8e..4b0562c2b1c2 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java @@ -36,6 +36,7 @@ public void processElement( @Timestamp Instant timestamp, OutputReceiver out) { CoGbkResult result = element.getValue(); + System.out.println("xxx [MIXED] Process timestamp: " + timestamp); // iterables are lazy-loaded from the shuffle service Iterable> deletes = result.getAll(DELETES); From ead532f93e1f20301de88a3a94825bbea5472235 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Sun, 28 Dec 2025 04:04:20 -0500 Subject: [PATCH 4/9] cleanup --- .../sdk/io/iceberg/SerializableDataFile.java | 18 +++-- .../io/iceberg/SerializableDeleteFile.java | 67 ++----------------- .../cdc/IncrementalChangelogSource.java | 26 +++---- .../io/iceberg/cdc/ReadFromChangelogs.java | 16 ++--- 4 files changed, 38 insertions(+), 89 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java index 2db173d75947..70f507254a0e 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java @@ -217,7 +217,7 @@ DataFile createDataFile(Map partitionSpecs) { // ByteBuddyUtils has trouble converting Map value type ByteBuffer // to byte[] and back to ByteBuffer, so we perform these conversions manually // TODO(https://github.com/apache/beam/issues/32701) - private static @Nullable Map toByteArrayMap( + static @Nullable Map toByteArrayMap( @Nullable Map input) { if (input == null) { return null; @@ -229,7 +229,7 @@ DataFile createDataFile(Map partitionSpecs) { return output; } - private static @Nullable Map toByteBufferMap( + 
static @Nullable Map toByteBufferMap( @Nullable Map input) { if (input == null) { return null; @@ -263,10 +263,13 @@ && getPartitionSpecId() == that.getPartitionSpecId() && Objects.equals(getNullValueCounts(), that.getNullValueCounts()) && Objects.equals(getNanValueCounts(), that.getNanValueCounts()) && mapEquals(getLowerBounds(), that.getLowerBounds()) - && mapEquals(getUpperBounds(), that.getUpperBounds()); + && mapEquals(getUpperBounds(), that.getUpperBounds()) + && Objects.equals(getDataSequenceNumber(), that.getDataSequenceNumber()) + && Objects.equals(getFileSequenceNumber(), that.getFileSequenceNumber()) + && Objects.equals(getFirstRowId(), that.getFirstRowId()); } - private static boolean mapEquals( + static boolean mapEquals( @Nullable Map map1, @Nullable Map map2) { if (map1 == null && map2 == null) { return true; @@ -304,13 +307,16 @@ public final int hashCode() { getColumnSizes(), getValueCounts(), getNullValueCounts(), - getNanValueCounts()); + getNanValueCounts(), + getDataSequenceNumber(), + getFileSequenceNumber(), + getFirstRowId()); hashCode = 31 * hashCode + computeMapByteHashCode(getLowerBounds()); hashCode = 31 * hashCode + computeMapByteHashCode(getUpperBounds()); return hashCode; } - private static int computeMapByteHashCode(@Nullable Map map) { + static int computeMapByteHashCode(@Nullable Map map) { if (map == null) { return 0; } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java index 9653f977805d..1c4ddf7e21ce 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java @@ -17,6 +17,10 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.io.iceberg.SerializableDataFile.computeMapByteHashCode; +import static org.apache.beam.sdk.io.iceberg.SerializableDataFile.mapEquals; +import static org.apache.beam.sdk.io.iceberg.SerializableDataFile.toByteArrayMap; +import static org.apache.beam.sdk.io.iceberg.SerializableDataFile.toByteBufferMap; import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import com.google.auto.value.AutoValue; @@ -260,33 +264,6 @@ public DeleteFile createDeleteFile( return deleteFileBuilder.build(); } - // ByteBuddyUtils has trouble converting Map value type ByteBuffer - // to byte[] and back to ByteBuffer, so we perform these conversions manually - // TODO(https://github.com/apache/beam/issues/32701) - private static @Nullable Map toByteArrayMap( - @Nullable Map input) { - if (input == null) { - return null; - } - Map output = new HashMap<>(input.size()); - for (Map.Entry e : input.entrySet()) { - output.put(e.getKey(), e.getValue().array()); - } - return output; - } - - private static @Nullable Map toByteBufferMap( - @Nullable Map input) { - if (input == null) { - return null; - } - Map output = new HashMap<>(input.size()); - for (Map.Entry e : input.entrySet()) { - output.put(e.getKey(), ByteBuffer.wrap(e.getValue())); - } - return output; - } - @Override public final boolean equals(@Nullable Object o) { if (this == o) { @@ -320,29 +297,6 @@ && mapEquals(getUpperBounds(), that.getUpperBounds()) && Objects.equals(getFileSequenceNumber(), that.getFileSequenceNumber()); } - private static boolean mapEquals( - @Nullable Map map1, @Nullable Map map2) { - if (map1 == null && map2 == null) { - return true; - } else 
if (map1 == null || map2 == null) { - return false; - } - Equivalence byteArrayEquivalence = - new Equivalence() { - @Override - protected boolean doEquivalent(byte[] a, byte[] b) { - return Arrays.equals(a, b); - } - - @Override - protected int doHash(byte[] bytes) { - return Arrays.hashCode(bytes); - } - }; - - return Maps.difference(map1, map2, byteArrayEquivalence).areEqual(); - } - @Override public final int hashCode() { int hashCode = @@ -371,17 +325,4 @@ public final int hashCode() { hashCode = 31 * hashCode + computeMapByteHashCode(getUpperBounds()); return hashCode; } - - private static int computeMapByteHashCode(@Nullable Map map) { - if (map == null) { - return 0; - } - int hashCode = 0; - for (Map.Entry entry : map.entrySet()) { - int keyHash = entry.getKey().hashCode(); - int valueHash = Arrays.hashCode(entry.getValue()); // content-based hash code - hashCode += keyHash ^ valueHash; - } - return hashCode; - } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java index f58893adb946..17bc97926a62 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java @@ -21,7 +21,7 @@ import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.UNIDIRECTIONAL_CHANGES; import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_DELETES; import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_INSERTS; -import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.UNIFORM_ROWS; +import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.UNIDIRECTIONAL_ROWS; import static org.apache.beam.sdk.io.iceberg.cdc.ReconcileChanges.DELETES; import static org.apache.beam.sdk.io.iceberg.cdc.ReconcileChanges.INSERTS; @@ -80,7 +80,8 @@ public PCollection expand(PBegin input) { .withOutputTags( UNIDIRECTIONAL_CHANGES, TupleTagList.of(BIDIRECTIONAL_CHANGES))); - // for changelog ordinal groups that have UNIFORM changes (i.e. all deletes, or all inserts), + // for changelog ordinal groups that have UNIDIRECTIONAL changes (i.e. all deletes, or all + // inserts), // take the fast approach of just reading and emitting CDC records. PCollection uniDirectionalCdcRows = processUniDirectionalChanges( @@ -88,36 +89,37 @@ public PCollection expand(PBegin input) { // changelog ordinal groups that have BIDIRECTIONAL changes (i.e. 
both deletes and inserts) // will need extra processing (including a shuffle) to identify any updates - PCollection largeBiDirectionalCdcRows = - processLargeBiDirectionalChanges( + PCollection biDirectionalCdcRows = + processBiDirectionalChanges( changelogTasks.get(BIDIRECTIONAL_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER)); // Merge UNIDIRECTIONAL and BIDIRECTIONAL outputs return PCollectionList.of(uniDirectionalCdcRows) - .and(largeBiDirectionalCdcRows) + .and(biDirectionalCdcRows) .apply(Flatten.pCollections()); } private PCollection processUniDirectionalChanges( - PCollection>> uniformChangelogs) { - return uniformChangelogs + PCollection>> + uniDirectionalChangelogs) { + return uniDirectionalChangelogs .apply(Redistribute.arbitrarily()) .apply( - "Read Uniform Changes", + "Read UniDirectional Changes", ParDo.of(ReadFromChangelogs.of(scanConfig)) - .withOutputTags(UNIFORM_ROWS, TupleTagList.empty())) - .get(UNIFORM_ROWS) + .withOutputTags(UNIDIRECTIONAL_ROWS, TupleTagList.empty())) + .get(UNIDIRECTIONAL_ROWS) .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema())); } - private PCollection processLargeBiDirectionalChanges( + private PCollection processBiDirectionalChanges( PCollection>> biDirectionalChangelogs) { PCollectionTuple biDirectionalKeyedRows = biDirectionalChangelogs .apply(Redistribute.arbitrarily()) .apply( - "Read Large BiDirectional Changes", + "Read BiDirectional Changes", ParDo.of(ReadFromChangelogs.withKeyedOutput(scanConfig)) .withOutputTags(KEYED_INSERTS, TupleTagList.of(KEYED_DELETES))); diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java index 4e202c7155f7..13eeb0d007da 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java @@ -50,7 +50,7 @@ @DoFn.BoundedPerElement public class ReadFromChangelogs extends DoFn>, OutT> { - public static final TupleTag UNIFORM_ROWS = new TupleTag<>(); + public static final TupleTag UNIDIRECTIONAL_ROWS = new TupleTag<>(); public static final TupleTag> KEYED_INSERTS = new TupleTag<>(); public static final TupleTag> KEYED_DELETES = new TupleTag<>(); @@ -104,13 +104,13 @@ static KvCoder keyedOutputCoder(IcebergScanConfig scanConfig) { private static Schema rowAndSnapshotIDBeamSchema(IcebergScanConfig scanConfig) { org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); org.apache.iceberg.Schema recordIdSchema = - recordSchema.select(recordSchema.identifierFieldNames()); + recordSchema.select(recordSchema.identifierFieldNames()); Schema rowIdBeamSchema = icebergSchemaToBeamSchema(recordIdSchema); List fields = - ImmutableList.builder() - .add(Schema.Field.of(SNAPSHOT_FIELD, Schema.FieldType.INT64)) - .addAll(rowIdBeamSchema.getFields()) - .build(); + ImmutableList.builder() + .add(Schema.Field.of(SNAPSHOT_FIELD, Schema.FieldType.INT64)) + .addAll(rowIdBeamSchema.getFields()) + .build(); return new Schema(fields); } @@ -243,8 +243,8 @@ private void outputRecord( Row id = structToBeamRow(snapshotId, recId, recordIdSchema, rowAndSnapshotIDBeamSchema); outputReceiver.get(keyedTag).outputWithTimestamp(KV.of(id, row), timestamp); } else { // fast path - System.out.printf("[UNIFORM] -- Output(%s, %s)\n%s%n", snapshotId, timestamp, row); - outputReceiver.get(UNIFORM_ROWS).outputWithTimestamp(row, 
timestamp); + System.out.printf("[UNIDIRECTIONAL] -- Output(%s, %s)\n%s%n", snapshotId, timestamp, row); + outputReceiver.get(UNIDIRECTIONAL_ROWS).outputWithTimestamp(row, timestamp); } } From b9b64aa79bf2403ab41d7d9c6326afc672be0328 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Sun, 28 Dec 2025 06:48:26 -0500 Subject: [PATCH 5/9] only consider bidirectional changes WITHIN a partition --- .../sdk/io/iceberg/SerializableDataFile.java | 14 +- .../io/iceberg/SerializableDeleteFile.java | 4 - .../sdk/io/iceberg/cdc/ChangelogScanner.java | 121 ++++++++++++------ .../io/iceberg/cdc/ReadFromChangelogs.java | 6 +- .../sdk/io/iceberg/cdc/ReconcileChanges.java | 11 +- 5 files changed, 95 insertions(+), 61 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java index 70f507254a0e..ba3e4bfb59db 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java @@ -72,7 +72,7 @@ public static Builder builder() { public abstract long getFileSizeInBytes(); @SchemaFieldNumber("4") - abstract String getPartitionPath(); + public abstract String getPartitionPath(); @SchemaFieldNumber("5") abstract int getPartitionSpecId(); @@ -217,8 +217,7 @@ DataFile createDataFile(Map partitionSpecs) { // ByteBuddyUtils has trouble converting Map value type ByteBuffer // to byte[] and back to ByteBuffer, so we perform these conversions manually // TODO(https://github.com/apache/beam/issues/32701) - static @Nullable Map toByteArrayMap( - @Nullable Map input) { + static @Nullable Map toByteArrayMap(@Nullable Map input) { if (input == null) { return null; } @@ -229,8 +228,7 @@ DataFile createDataFile(Map partitionSpecs) { return output; } - static @Nullable Map toByteBufferMap( - @Nullable Map input) { + static @Nullable Map toByteBufferMap(@Nullable Map input) { if (input == null) { return null; } @@ -308,9 +306,9 @@ public final int hashCode() { getValueCounts(), getNullValueCounts(), getNanValueCounts(), - getDataSequenceNumber(), - getFileSequenceNumber(), - getFirstRowId()); + getDataSequenceNumber(), + getFileSequenceNumber(), + getFirstRowId()); hashCode = 31 * hashCode + computeMapByteHashCode(getLowerBounds()); hashCode = 31 * hashCode + computeMapByteHashCode(getUpperBounds()); return hashCode; diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java index 1c4ddf7e21ce..d449403988a7 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDeleteFile.java @@ -26,16 +26,12 @@ import com.google.auto.value.AutoValue; import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.schemas.annotations.SchemaFieldNumber; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Equivalence; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; import 
org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileContent; import org.apache.iceberg.FileFormat; diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java index d9c59313d58a..4ae88e837c14 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java @@ -60,6 +60,10 @@ public class ChangelogScanner Metrics.counter(ChangelogScanner.class, "numDeletedRowsScanTasks"); private static final Counter numDeletedDataFileScanTasks = Metrics.counter(ChangelogScanner.class, "numDeletedDataFileScanTasks"); + private static final Counter numUniDirectionalTasks = + Metrics.counter(ChangelogScanner.class, "numUniDirectionalTasks"); + private static final Counter numBiDirectionalTasks = + Metrics.counter(ChangelogScanner.class, "numBiDirectionalTasks"); public static final TupleTag>> UNIDIRECTIONAL_CHANGES = new TupleTag<>(); public static final TupleTag>> @@ -115,14 +119,15 @@ private void createAndOutputReadTasks( int numDeletedFileTasks = 0; Map cachedSnapshotTimestamps = new HashMap<>(); - // Maintain the same scan task groupings produced by Iceberg's binpacking, for + // Best effort maintain the same scan task groupings produced by Iceberg's binpacking, for // better work load distribution among readers. // This allows the user to control load per worker by tuning `read.split.target-size`: // https://iceberg.apache.org/docs/latest/configuration/#read-properties - Map>> changelogScanTaskGroups = new HashMap<>(); + Map>> changelogScanTasks = new HashMap<>(); - // keep track of the types of changes in each ordinal - Map> changeTypesPerOrdinal = new HashMap<>(); + // keep track of the types of changes in each partition. 
do this for each ordinal + Map>> partitionChangeTypesPerOrdinal = + new HashMap<>(); try (CloseableIterable> scanTaskGroups = scan.planTasks()) { for (ScanTaskGroup scanTaskGroup : scanTaskGroups) { @@ -137,13 +142,9 @@ private void createAndOutputReadTasks( SerializableChangelogTask task = SerializableChangelogTask.from(changelogScanTask, timestampMillis); - ordinalTaskGroup.computeIfAbsent(ordinal, (o) -> new ArrayList<>()).add(task); - - changeTypesPerOrdinal - .computeIfAbsent(ordinal, (o) -> new HashSet<>()) - .add(task.getType()); + String partition = task.getDataFile().getPartitionPath(); - // metric gathering + // gather metrics switch (task.getType()) { case ADDED_ROWS: numAddedRowsTasks++; @@ -155,34 +156,29 @@ private void createAndOutputReadTasks( numDeletedFileTasks++; break; } + + partitionChangeTypesPerOrdinal + .computeIfAbsent(ordinal, (o) -> new HashMap<>()) + .computeIfAbsent(partition, (p) -> new HashSet<>()) + .add(task.getType()); + + ordinalTaskGroup.computeIfAbsent(ordinal, (o) -> new ArrayList<>()).add(task); } for (Map.Entry> ordinalGroup : ordinalTaskGroup.entrySet()) { - changelogScanTaskGroups + changelogScanTasks .computeIfAbsent(ordinalGroup.getKey(), (unused) -> new ArrayList<>()) .add(ordinalGroup.getValue()); } } } - int totalTasks = numAddedRowsTasks + numDeletedRowsTasks + numDeletedFileTasks; - totalChangelogScanTasks.inc(totalTasks); - numAddedRowsScanTasks.inc(numAddedRowsTasks); - numDeletedRowsScanTasks.inc(numDeletedRowsTasks); - numDeletedDataFileScanTasks.inc(numDeletedFileTasks); - - LOG.info( - "Snapshots [{}, {}] produced {} tasks:\n\t{} AddedRowsScanTasks\n\t{} DeletedRowsScanTasks\n\t{} DeletedDataFileScanTasks", - startSnapshot.getSnapshotId(), - endSnapshot.getSnapshotId(), - totalTasks, - numAddedRowsTasks, - numDeletedRowsTasks, - numDeletedFileTasks); + int numUniDirTasks = 0; + int numBiDirTasks = 0; for (Map.Entry>> taskGroups : - changelogScanTaskGroups.entrySet()) { + changelogScanTasks.entrySet()) { int ordinal = taskGroups.getKey(); ChangelogDescriptor descriptor = ChangelogDescriptor.builder() @@ -194,28 +190,69 @@ private void createAndOutputReadTasks( for (List subgroup : taskGroups.getValue()) { Instant timestamp = Instant.ofEpochMilli(subgroup.get(0).getTimestampMillis()); - KV> output = - KV.of(descriptor, subgroup); // Determine where each ordinal's tasks will go, based on the type of changes: // 1. If an ordinal's changes are unidirectional (i.e. only inserts or only deletes), they - // should be - // processed directly in the fast path. - // 2. If an ordinal's changes are bidirectional (i.e. both inserts and deletes), they will - // need - // more careful processing to determine if any updates have occurred. - Set changeTypes = - checkStateNotNull(changeTypesPerOrdinal.get(ordinal)); - TupleTag>> outputTag; - if (changeTypes.contains(ADDED_ROWS) - && changeTypes.size() > 1) { // both added and deleted rows - outputTag = BIDIRECTIONAL_CHANGES; - } else { // only added or only deleted rows - outputTag = UNIDIRECTIONAL_CHANGES; + // should be processed directly in the fast path. + // 2. If an ordinal's changes are bidirectional (i.e. both inserts and deletes) within a + // partition, they will need more careful processing to determine if any updates have + // occurred. 
+ Map> changeTypesPerPartition = + checkStateNotNull(partitionChangeTypesPerOrdinal.get(ordinal)); + List uniDirTasks = new ArrayList<>(); + List biDirTasks = new ArrayList<>(); + for (SerializableChangelogTask task : subgroup) { + Set partitionChangeTypes = + checkStateNotNull(changeTypesPerPartition.get(task.getDataFile().getPartitionPath())); + if (containsBiDirectionalChanges(partitionChangeTypes)) { + biDirTasks.add(task); + } else { + uniDirTasks.add(task); + } } - multiOutputReceiver.get(outputTag).outputWithTimestamp(output, timestamp); + if (!uniDirTasks.isEmpty()) { + KV> uniDirOutput = + KV.of(descriptor, uniDirTasks); + multiOutputReceiver + .get(UNIDIRECTIONAL_CHANGES) + .outputWithTimestamp(uniDirOutput, timestamp); + numUniDirTasks += uniDirTasks.size(); + } + if (!biDirTasks.isEmpty()) { + KV> biDirOutput = + KV.of(descriptor, biDirTasks); + multiOutputReceiver + .get(BIDIRECTIONAL_CHANGES) + .outputWithTimestamp(biDirOutput, timestamp); + numBiDirTasks += biDirTasks.size(); + } } } + + int totalTasks = numAddedRowsTasks + numDeletedRowsTasks + numDeletedFileTasks; + totalChangelogScanTasks.inc(totalTasks); + numAddedRowsScanTasks.inc(numAddedRowsTasks); + numDeletedRowsScanTasks.inc(numDeletedRowsTasks); + numDeletedDataFileScanTasks.inc(numDeletedFileTasks); + numUniDirectionalTasks.inc(numUniDirTasks); + numBiDirectionalTasks.inc(numBiDirTasks); + + LOG.info( + "Snapshots [{}, {}] produced {} tasks:\n\t{} AddedRowsScanTasks\n\t{} DeletedRowsScanTasks\n\t{} DeletedDataFileScanTasks\n" + + "Observed {} uni-directional tasks and {} bi-directional tasks.", + startSnapshot.getSnapshotId(), + endSnapshot.getSnapshotId(), + totalTasks, + numAddedRowsTasks, + numDeletedRowsTasks, + numDeletedFileTasks, + numUniDirTasks, + numBiDirTasks); + } + + private static boolean containsBiDirectionalChanges( + Set changeTypes) { + return changeTypes.contains(ADDED_ROWS) && changeTypes.size() > 1; } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java index 13eeb0d007da..ad7c3c4288b5 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java @@ -168,6 +168,7 @@ private void processAddedRowsTask( for (Record rec : filtered) { outputRecord( + "INSERT", rec, outputReceiver, task.getCommitSnapshotId(), @@ -195,6 +196,7 @@ private void processDeletedRowsTask( for (Record rec : newlyDeletedRecords) { // TODO: output with DELETE kind outputRecord( + "DELETE", rec, outputReceiver, task.getCommitSnapshotId(), @@ -216,6 +218,7 @@ private void processDeletedFileTask( for (Record rec : filtered) { // TODO: output with DELETE kind outputRecord( + "DELETE-DF", rec, outputReceiver, task.getCommitSnapshotId(), @@ -227,6 +230,7 @@ private void processDeletedFileTask( } private void outputRecord( + String type, Record rec, MultiOutputReceiver outputReceiver, long snapshotId, @@ -243,7 +247,7 @@ private void outputRecord( Row id = structToBeamRow(snapshotId, recId, recordIdSchema, rowAndSnapshotIDBeamSchema); outputReceiver.get(keyedTag).outputWithTimestamp(KV.of(id, row), timestamp); } else { // fast path - System.out.printf("[UNIDIRECTIONAL] -- Output(%s, %s)\n%s%n", snapshotId, timestamp, row); + System.out.printf("[UNIDIRECTIONAL] -- %s(%s, %s)\n%s%n", type, snapshotId, timestamp, row); 
outputReceiver.get(UNIDIRECTIONAL_ROWS).outputWithTimestamp(row, timestamp); } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java index 4b0562c2b1c2..0647e546d64a 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java @@ -36,7 +36,6 @@ public void processElement( @Timestamp Instant timestamp, OutputReceiver out) { CoGbkResult result = element.getValue(); - System.out.println("xxx [MIXED] Process timestamp: " + timestamp); // iterables are lazy-loaded from the shuffle service Iterable> deletes = result.getAll(DELETES); @@ -56,30 +55,30 @@ public void processElement( // TODO: output as UPDATE_BEFORE kind TimestampedValue updateBefore = deletesIterator.next(); out.outputWithTimestamp(updateBefore.getValue(), updateBefore.getTimestamp()); - System.out.printf("[MIXED] -- UpdateBefore\n%s\n", updateBefore); + System.out.printf("[BIDIRECTIONAL] -- UpdateBefore\n%s\n", updateBefore); // TODO: output as UPDATE_AFTER kind TimestampedValue updateAfter = insertsIterator.next(); out.outputWithTimestamp(updateAfter.getValue(), updateAfter.getTimestamp()); - System.out.printf("[MIXED] -- UpdateAfter\n%s\n", updateAfter); + System.out.printf("[BIDIRECTIONAL] -- UpdateAfter\n%s\n", updateAfter); } while (insertsIterator.hasNext()) { // TODO: output as UPDATE_AFTER kind TimestampedValue insert = insertsIterator.next(); out.outputWithTimestamp(insert.getValue(), insert.getTimestamp()); - System.out.printf("[MIXED] -- Added(extra)\n%s\n", insert); + System.out.printf("[BIDIRECTIONAL] -- Added(extra)\n%s\n", insert); } } else if (hasInserts) { // INSERT only for (TimestampedValue rec : inserts) { - System.out.printf("[MIXED] -- Added\n%s\n", rec); + System.out.printf("[BIDIRECTIONAL] -- Added\n%s\n", rec); out.outputWithTimestamp(rec.getValue(), rec.getTimestamp()); } } else if (hasDeletes) { // DELETE only for (TimestampedValue rec : deletes) { // TODO: output as DELETE kind - System.out.printf("[MIXED] -- Deleted\n%s\n", rec); + System.out.printf("[BIDIRECTIONAL] -- Deleted\n%s\n", rec); out.outputWithTimestamp(rec.getValue(), rec.getTimestamp()); } } From 615c037d1cb7c054a2ad2ff6715fcb9127f5ebea Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 29 Dec 2025 12:45:02 -0500 Subject: [PATCH 6/9] scan optimization only for pinned partitions. 
add javadoc to ChangelogScanner --- .../sdk/io/iceberg/cdc/ChangelogScanner.java | 124 +++++++++++++++--- 1 file changed, 104 insertions(+), 20 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java index 4ae88e837c14..d7cf462b477c 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java @@ -38,6 +38,8 @@ import org.apache.beam.sdk.values.TupleTag; import org.apache.iceberg.ChangelogScanTask; import org.apache.iceberg.IncrementalChangelogScan; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.ScanTaskGroup; import org.apache.iceberg.SerializableTable; import org.apache.iceberg.Table; @@ -48,6 +50,35 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +/** + * DoFn that takes a list of snapshots and scans for changelogs using Iceberg's {@link + * IncrementalChangelogScan} and routes them to different downstream PCollections based on + * complexity. + * + *

The Iceberg scan generates groups of changelog scan tasks, where each task belongs to a + * specific "ordinal" (a position in the sequence of table snapshots). Task grouping depends on the + * table's split-size + * property. + * + *
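As a point of reference, the split-size mentioned above is Iceberg's standard read property "read.split.target-size"; a minimal sketch of tuning it (hypothetical 64 MiB target, any Table handle) could look like:

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;

class SplitSizeTuningSketch {
  // Sets the read split target size so Iceberg's binpacking produces smaller task groups,
  // which in turn bounds the amount of work handed to each reader bundle.
  static void setSplitTargetSize(Table table) {
    table
        .updateProperties()
        .set(TableProperties.SPLIT_SIZE, String.valueOf(64L * 1024L * 1024L))
        .commit();
  }
}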

This DoFn analyzes the nature of changes within each ordinal and routes them accordingly: + * + *

    + *
  1. Unidirectional (Fast Path): If an ordinal contains only inserts OR only deletes, its + * tasks are emitted to {@link #UNIDIRECTIONAL_CHANGES}. These records bypass the CoGBK + * shuffle and are output immediately. + *
  2. Bidirectional (Slow Path): If an ordinal contains a mix of inserts and deletes, its + * tasks are emitted to {@link #BIDIRECTIONAL_CHANGES}. These records are grouped by Primary + * Key and processed by {@link ReconcileChanges} to identify potential updates. + *
+ * + *

Optimization for Pinned Partitions

+ * + *

If the table's partition fields are derived entirely from Primary Key fields, we assume that a + * record will not migrate between partitions. This narrows down data locality and allows us to only + * check for bi-directional changes within a partition. Doing this will allow + * partitions with uni-directional changes to bypass the expensive CoGBK shuffle. + */ public class ChangelogScanner extends DoFn< KV>, KV>> { @@ -118,6 +149,12 @@ private void createAndOutputReadTasks( int numDeletedRowsTasks = 0; int numDeletedFileTasks = 0; + // if the record's identifier fields include all partitioned fields, we can further optimize the + // scan + // by only shuffling bi-directional changes *within* a partition. + // this is safe to do because we can assume a record change will not be a cross-partition change + boolean rowsPinnedToPartition = isRowPinnedToPartition(table.spec()); + Map cachedSnapshotTimestamps = new HashMap<>(); // Best effort maintain the same scan task groupings produced by Iceberg's binpacking, for // better work load distribution among readers. @@ -125,9 +162,12 @@ private void createAndOutputReadTasks( // https://iceberg.apache.org/docs/latest/configuration/#read-properties Map>> changelogScanTasks = new HashMap<>(); - // keep track of the types of changes in each partition. do this for each ordinal - Map>> partitionChangeTypesPerOrdinal = - new HashMap<>(); + // keep track of change types per ordinal + Map> changeTypesPerOrdinal = new HashMap<>(); + // keep track of change types per partition, per ordinal (useful if record is pinned to its + // partition) + Map>> + changeTypesPerPartitionPerOrdinal = new HashMap<>(); try (CloseableIterable> scanTaskGroups = scan.planTasks()) { for (ScanTaskGroup scanTaskGroup : scanTaskGroups) { @@ -157,10 +197,16 @@ private void createAndOutputReadTasks( break; } - partitionChangeTypesPerOrdinal - .computeIfAbsent(ordinal, (o) -> new HashMap<>()) - .computeIfAbsent(partition, (p) -> new HashSet<>()) - .add(task.getType()); + if (rowsPinnedToPartition) { + changeTypesPerPartitionPerOrdinal + .computeIfAbsent(ordinal, (o) -> new HashMap<>()) + .computeIfAbsent(partition, (p) -> new HashSet<>()) + .add(task.getType()); + } else { + changeTypesPerOrdinal + .computeIfAbsent(ordinal, (o) -> new HashSet<>()) + .add(task.getType()); + } ordinalTaskGroup.computeIfAbsent(ordinal, (o) -> new ArrayList<>()).add(task); } @@ -194,20 +240,40 @@ private void createAndOutputReadTasks( // Determine where each ordinal's tasks will go, based on the type of changes: // 1. If an ordinal's changes are unidirectional (i.e. only inserts or only deletes), they // should be processed directly in the fast path. - // 2. If an ordinal's changes are bidirectional (i.e. both inserts and deletes) within a - // partition, they will need more careful processing to determine if any updates have - // occurred. - Map> changeTypesPerPartition = - checkStateNotNull(partitionChangeTypesPerOrdinal.get(ordinal)); + // 2. If an ordinal's changes are bidirectional (i.e. both inserts and deletes), they will + // need more careful processing to determine if any updates have occurred. 
List uniDirTasks = new ArrayList<>(); List biDirTasks = new ArrayList<>(); - for (SerializableChangelogTask task : subgroup) { - Set partitionChangeTypes = - checkStateNotNull(changeTypesPerPartition.get(task.getDataFile().getPartitionPath())); - if (containsBiDirectionalChanges(partitionChangeTypes)) { - biDirTasks.add(task); + + // if we can guarantee no cross-partition changes, we can further drill down and only + // include + // bi-directional changes within a partition + if (rowsPinnedToPartition) { + Map> changeTypesPerPartition = + checkStateNotNull(changeTypesPerPartitionPerOrdinal.get(ordinal)); + for (SerializableChangelogTask task : subgroup) { + Set partitionChangeTypes = + checkStateNotNull( + changeTypesPerPartition.get(task.getDataFile().getPartitionPath())); + + if (containsBiDirectionalChanges(partitionChangeTypes)) { + biDirTasks.add(task); + } else { + uniDirTasks.add(task); + } + } + } else { + // otherwise, we need to look at the ordinal's changes as a whole. this is the safer + // option because a cross-partition change may occur + // (e.g. an update occurs by deleting a record in partition A, then inserting the new + // record in partition B) + Set changeTypes = + checkStateNotNull(changeTypesPerOrdinal.get(ordinal)); + + if (containsBiDirectionalChanges(changeTypes)) { + biDirTasks = subgroup; } else { - uniDirTasks.add(task); + uniDirTasks = subgroup; } } @@ -240,7 +306,7 @@ private void createAndOutputReadTasks( LOG.info( "Snapshots [{}, {}] produced {} tasks:\n\t{} AddedRowsScanTasks\n\t{} DeletedRowsScanTasks\n\t{} DeletedDataFileScanTasks\n" - + "Observed {} uni-directional tasks and {} bi-directional tasks.", + + "Observed {} uni-directional tasks and {} bi-directional tasks, using per-{} mapping.", startSnapshot.getSnapshotId(), endSnapshot.getSnapshotId(), totalTasks, @@ -248,11 +314,29 @@ private void createAndOutputReadTasks( numDeletedRowsTasks, numDeletedFileTasks, numUniDirTasks, - numBiDirTasks); + numBiDirTasks, + rowsPinnedToPartition ? "partition" : "ordinal"); } + /** Checks if a set of change types include both inserts and deletes */ private static boolean containsBiDirectionalChanges( Set changeTypes) { return changeTypes.contains(ADDED_ROWS) && changeTypes.size() > 1; } + + /** Checks if all partition fields are derived from record identifier fields. 
*/ + private static boolean isRowPinnedToPartition(PartitionSpec spec) { + Set identifierFieldsIds = spec.schema().identifierFieldIds(); + if (spec.isUnpartitioned() || identifierFieldsIds.isEmpty()) { + return false; + } + + for (PartitionField field : spec.fields()) { + if (!identifierFieldsIds.contains(field.sourceId())) { + return false; + } + } + + return true; + } } From e228f2a253701109caac93364e951ca0cbf2071d Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 29 Dec 2025 13:01:02 -0500 Subject: [PATCH 7/9] fixes --- .../beam/sdk/io/iceberg/IcebergScanConfig.java | 2 +- .../beam/sdk/io/iceberg/ReadUtilsTest.java | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java index 3829baa43665..3c042567a29e 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java @@ -127,7 +127,7 @@ public org.apache.iceberg.Schema getRequiredSchema() { if (cachedRequiredSchema == null) { cachedRequiredSchema = resolveSchema( - getTable().schema(), + IcebergUtils.beamSchemaToIcebergSchema(getSchema()), getKeepFields(), getDropFields(), FilterUtils.getReferencedFieldNames(getFilterString())); diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ReadUtilsTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ReadUtilsTest.java index 73a0fd19e893..6287a6e06197 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ReadUtilsTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ReadUtilsTest.java @@ -40,7 +40,6 @@ import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.data.Record; import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.parquet.ParquetReader; import org.checkerframework.checker.nullness.qual.Nullable; import org.junit.ClassRule; import org.junit.Rule; @@ -75,14 +74,25 @@ public void testCreateReader() throws IOException { .commit(); } + IcebergScanConfig scanConfig = + IcebergScanConfig.builder() + .setCatalogConfig( + IcebergCatalogConfig.builder() + .setCatalogProperties( + ImmutableMap.of("type", "hadoop", "warehouse", warehouse.location)) + .build()) + .setTableIdentifier(tableId) + .setSchema(IcebergUtils.icebergSchemaToBeamSchema(simpleTable.schema())) + .build(); + int numFiles = 0; try (CloseableIterable iterable = simpleTable.newScan().planTasks()) { for (CombinedScanTask combinedScanTask : iterable) { for (FileScanTask fileScanTask : combinedScanTask.tasks()) { String fileName = Iterables.getLast(Splitter.on("/").split(fileScanTask.file().path())); List recordsRead = new ArrayList<>(); - try (ParquetReader reader = - ReadUtils.createReader(fileScanTask, simpleTable, simpleTable.schema())) { + try (CloseableIterable reader = + ReadUtils.createReader(fileScanTask, simpleTable, scanConfig)) { reader.forEach(recordsRead::add); } From 24cc76d7d3d37f182f7ee3da91cb9a51bd842125 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 30 Dec 2025 11:58:51 -0500 Subject: [PATCH 8/9] add a java docs and fix spotless --- .../beam/sdk/io/iceberg/PartitionUtils.java | 2 +- .../sdk/io/iceberg/cdc/ChangelogScanner.java | 6 +- .../cdc/IncrementalChangelogSource.java | 97 +-- 
.../io/iceberg/cdc/ReadFromChangelogs.java | 597 ++++++++++++------ .../sdk/io/iceberg/cdc/ReconcileChanges.java | 19 +- .../cdc/SerializableChangelogTask.java | 2 +- .../beam/sdk/io/iceberg/cdc/package-info.java | 20 + 7 files changed, 461 insertions(+), 282 deletions(-) create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/package-info.java diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java index 8fbef14e3eb5..6c0230153cdf 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/PartitionUtils.java @@ -100,7 +100,7 @@ static PartitionSpec toPartitionSpec( /** * Copied over from Apache Iceberg's PartitionUtil + * href="https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java">PartitionUtil. */ public static Map constantsMap( PartitionSpec spec, ContentFile file, BiFunction convertConstant) { diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java index d7cf462b477c..5aeaa3806650 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java @@ -76,8 +76,8 @@ * *

If the table's partition fields are derived entirely from Primary Key fields, we assume that a * record will not migrate between partitions. This narrows down data locality and allows us to only - * check for bi-directional changes within a partition. Doing this will allow - * partitions with uni-directional changes to bypass the expensive CoGBK shuffle. + * check for bi-directional changes within a partition. Doing this will allow partitions with + * uni-directional changes to bypass the expensive CoGBK shuffle. */ public class ChangelogScanner extends DoFn< @@ -318,7 +318,7 @@ private void createAndOutputReadTasks( rowsPinnedToPartition ? "partition" : "ordinal"); } - /** Checks if a set of change types include both inserts and deletes */ + /** Checks if a set of change types include both inserts and deletes. */ private static boolean containsBiDirectionalChanges( Set changeTypes) { return changeTypes.contains(ADDED_ROWS) && changeTypes.size() > 1; diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java index 17bc97926a62..e8ea166d9e0a 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java @@ -19,14 +19,10 @@ import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.BIDIRECTIONAL_CHANGES; import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.UNIDIRECTIONAL_CHANGES; -import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_DELETES; -import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_INSERTS; -import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.UNIDIRECTIONAL_ROWS; import static org.apache.beam.sdk.io.iceberg.cdc.ReconcileChanges.DELETES; import static org.apache.beam.sdk.io.iceberg.cdc.ReconcileChanges.INSERTS; import java.util.List; -import org.apache.beam.sdk.coders.KvCoder; import org.apache.beam.sdk.io.iceberg.IcebergScanConfig; import org.apache.beam.sdk.io.iceberg.IcebergUtils; import org.apache.beam.sdk.io.iceberg.IncrementalScanSource; @@ -34,24 +30,24 @@ import org.apache.beam.sdk.transforms.Flatten; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.Redistribute; -import org.apache.beam.sdk.transforms.Reify; import org.apache.beam.sdk.transforms.join.CoGroupByKey; import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple; -import org.apache.beam.sdk.transforms.windowing.GlobalWindows; -import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; -import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PBegin; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionList; import org.apache.beam.sdk.values.PCollectionTuple; import org.apache.beam.sdk.values.Row; -import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.sdk.values.TupleTagList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; import org.apache.iceberg.Table; import org.apache.iceberg.catalog.TableIdentifier; +/** + * An Iceberg source that incrementally reads a table's changelogs using range(s) of table + * snapshots. 
The bounded source creates a single range, while the unbounded implementation + * continuously polls for new snapshots at the specified interval. + */ public class IncrementalChangelogSource extends IncrementalScanSource { public IncrementalChangelogSource(IcebergScanConfig scanConfig) { super(scanConfig); @@ -79,81 +75,24 @@ public PCollection expand(PBegin input) { ParDo.of(new ChangelogScanner(scanConfig)) .withOutputTags( UNIDIRECTIONAL_CHANGES, TupleTagList.of(BIDIRECTIONAL_CHANGES))); + changelogTasks.get(UNIDIRECTIONAL_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER); + changelogTasks.get(BIDIRECTIONAL_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER); - // for changelog ordinal groups that have UNIDIRECTIONAL changes (i.e. all deletes, or all - // inserts), - // take the fast approach of just reading and emitting CDC records. - PCollection uniDirectionalCdcRows = - processUniDirectionalChanges( - changelogTasks.get(UNIDIRECTIONAL_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER)); + // process changelog tasks and output rows + ReadFromChangelogs.CdcOutput outputRows = + changelogTasks.apply(new ReadFromChangelogs(scanConfig)); - // changelog ordinal groups that have BIDIRECTIONAL changes (i.e. both deletes and inserts) - // will need extra processing (including a shuffle) to identify any updates + // compare bi-directional rows to identify potential updates PCollection biDirectionalCdcRows = - processBiDirectionalChanges( - changelogTasks.get(BIDIRECTIONAL_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER)); + KeyedPCollectionTuple.of(INSERTS, outputRows.keyedInserts()) + .and(DELETES, outputRows.keyedDeletes()) + .apply("CoGroupBy Primary Key", CoGroupByKey.create()) + .apply("Reconcile Updates-Inserts-Deletes", ParDo.of(new ReconcileChanges())) + .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema())); - // Merge UNIDIRECTIONAL and BIDIRECTIONAL outputs - return PCollectionList.of(uniDirectionalCdcRows) + // Merge uni-directional and bi-directional outputs + return PCollectionList.of(outputRows.uniDirectionalRows()) .and(biDirectionalCdcRows) .apply(Flatten.pCollections()); } - - private PCollection processUniDirectionalChanges( - PCollection>> - uniDirectionalChangelogs) { - return uniDirectionalChangelogs - .apply(Redistribute.arbitrarily()) - .apply( - "Read UniDirectional Changes", - ParDo.of(ReadFromChangelogs.of(scanConfig)) - .withOutputTags(UNIDIRECTIONAL_ROWS, TupleTagList.empty())) - .get(UNIDIRECTIONAL_ROWS) - .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema())); - } - - private PCollection processBiDirectionalChanges( - PCollection>> - biDirectionalChangelogs) { - PCollectionTuple biDirectionalKeyedRows = - biDirectionalChangelogs - .apply(Redistribute.arbitrarily()) - .apply( - "Read BiDirectional Changes", - ParDo.of(ReadFromChangelogs.withKeyedOutput(scanConfig)) - .withOutputTags(KEYED_INSERTS, TupleTagList.of(KEYED_DELETES))); - - // prior to CoGBK, set a windowing strategy to maintain the earliest timestamp in the window - // this allows us to emit records downstream that may have larger reified timestamps - Window>> windowingStrategy = - Window.>>into(new GlobalWindows()) - .withTimestampCombiner(TimestampCombiner.EARLIEST); - - // preserve the element's timestamp by moving it into the value - // in the normal case, this will be a no-op because all CDC rows in an ordinal have the same - // commit timestamp. 
- // but this will matter if we add custom watermarking, where record timestamps are - // derived from a specified column - KvCoder keyedOutputCoder = ReadFromChangelogs.keyedOutputCoder(scanConfig); - PCollection>> keyedInsertsWithTimestamps = - biDirectionalKeyedRows - .get(KEYED_INSERTS) - .setCoder(keyedOutputCoder) - .apply("Reify INSERT Timestamps", Reify.timestampsInValue()) - .apply(windowingStrategy); - PCollection>> keyedDeletesWithTimestamps = - biDirectionalKeyedRows - .get(KEYED_DELETES) - .setCoder(keyedOutputCoder) - .apply("Reify DELETE Timestamps", Reify.timestampsInValue()) - .apply(windowingStrategy); - - // CoGroup by record ID and emit any (DELETE + INSERT) pairs as updates: (UPDATE_BEFORE, - // UPDATE_AFTER) - return KeyedPCollectionTuple.of(INSERTS, keyedInsertsWithTimestamps) - .and(DELETES, keyedDeletesWithTimestamps) - .apply("CoGroupBy Row ID", CoGroupByKey.create()) - .apply("Reconcile Inserts and Deletes", ParDo.of(new ReconcileChanges())) - .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema())); - } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java index ad7c3c4288b5..b06b95f1ac8d 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java @@ -18,9 +18,13 @@ package org.apache.beam.sdk.io.iceberg.cdc; import static org.apache.beam.sdk.io.iceberg.IcebergUtils.icebergSchemaToBeamSchema; +import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.BIDIRECTIONAL_CHANGES; +import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.UNIDIRECTIONAL_CHANGES; import java.io.IOException; import java.util.List; +import java.util.Map; +import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.KvCoder; import org.apache.beam.sdk.io.iceberg.IcebergScanConfig; import org.apache.beam.sdk.io.iceberg.IcebergUtils; @@ -32,10 +36,24 @@ import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Redistribute; +import org.apache.beam.sdk.transforms.Reify; import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker; +import org.apache.beam.sdk.transforms.windowing.GlobalWindows; +import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; +import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.PInput; +import org.apache.beam.sdk.values.POutput; +import org.apache.beam.sdk.values.PValue; import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.iceberg.StructLike; @@ -47,55 +65,54 @@ import org.apache.iceberg.util.StructProjection; import org.joda.time.Instant; -@DoFn.BoundedPerElement -public class ReadFromChangelogs 
- extends DoFn>, OutT> { - public static final TupleTag UNIDIRECTIONAL_ROWS = new TupleTag<>(); - public static final TupleTag> KEYED_INSERTS = new TupleTag<>(); - public static final TupleTag> KEYED_DELETES = new TupleTag<>(); - - private final Counter numAddedRowsScanTasksCompleted = +/** + * A {@link PTransform} that processed {@link org.apache.iceberg.ChangelogScanTask}s. They come in + * three types: + * + *

    + *
  1. AddedRowsScanTask: Indicates records have been inserted via a new DataFile. + *
  2. DeletedRowsScanTask: Indicates records have been deleted via a Position DeleteFile + * or Equality DeleteFile. + *
  3. DeletedDataFileScanTask: Indicates a whole DataFile has been deleted. + *
+ * + * Each of these ChangelogScanTasks needs to be processed differently. More details in the + * corresponding methods: + * + *
    + *
  1. {@link ReadDoFn#processAddedRowsTask(SerializableChangelogTask, Table, + * DoFn.MultiOutputReceiver)} + *
  2. {@link ReadDoFn#processDeletedRowsTask(SerializableChangelogTask, Table, + * DoFn.MultiOutputReceiver)} + *
  3. {@link ReadDoFn#processDeletedFileTask(SerializableChangelogTask, Table, + * DoFn.MultiOutputReceiver)} + *
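For orientation only, a minimal sketch of how the three Iceberg task types above can be told apart (the DoFn itself dispatches on the serialized task's type instead), with labels matching the cases handled in this transform:

import org.apache.iceberg.AddedRowsScanTask;
import org.apache.iceberg.ChangelogScanTask;
import org.apache.iceberg.DeletedDataFileScanTask;
import org.apache.iceberg.DeletedRowsScanTask;

class ChangelogTaskKindSketch {
  // Labels a changelog scan task by its concrete Iceberg type.
  static String kindOf(ChangelogScanTask task) {
    if (task instanceof AddedRowsScanTask) {
      return "ADDED_ROWS"; // a new DataFile was committed -> inserted rows
    } else if (task instanceof DeletedRowsScanTask) {
      return "DELETED_ROWS"; // position/equality delete files -> row-level deletes
    } else if (task instanceof DeletedDataFileScanTask) {
      return "DELETED_FILE"; // an entire DataFile was removed -> all its live rows deleted
    }
    return "UNKNOWN";
  }
}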
+ */ +class ReadFromChangelogs extends PTransform { + private static final Counter numAddedRowsScanTasksCompleted = Metrics.counter(ReadFromChangelogs.class, "numAddedRowsScanTasksCompleted"); - private final Counter numDeletedRowsScanTasksCompleted = + private static final Counter numDeletedRowsScanTasksCompleted = Metrics.counter(ReadFromChangelogs.class, "numDeletedRowsScanTasksCompleted"); - private final Counter numDeletedDataFileScanTasksCompleted = + private static final Counter numDeletedDataFileScanTasksCompleted = Metrics.counter(ReadFromChangelogs.class, "numDeletedDataFileScanTasksCompleted"); + private static final TupleTag UNIDIRECTIONAL_ROWS = new TupleTag<>(); + private static final TupleTag> KEYED_INSERTS = new TupleTag<>(); + private static final TupleTag> KEYED_DELETES = new TupleTag<>(); + private final IcebergScanConfig scanConfig; - private final boolean keyedOutput; - private transient StructProjection recordIdProjection; - private transient org.apache.iceberg.Schema recordIdSchema; - private final Schema beamRowSchema; private final Schema rowAndSnapshotIDBeamSchema; + // TODO: Any better way of doing this? private static final String SNAPSHOT_FIELD = "__beam__changelog__snapshot__id__"; - private ReadFromChangelogs(IcebergScanConfig scanConfig, boolean keyedOutput) { + ReadFromChangelogs(IcebergScanConfig scanConfig) { this.scanConfig = scanConfig; - this.keyedOutput = keyedOutput; - - this.beamRowSchema = icebergSchemaToBeamSchema(scanConfig.getProjectedSchema()); - org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); - this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames()); - this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema); - this.rowAndSnapshotIDBeamSchema = rowAndSnapshotIDBeamSchema(scanConfig); } - static ReadFromChangelogs of(IcebergScanConfig scanConfig) { - return new ReadFromChangelogs<>(scanConfig, false); - } - - static ReadFromChangelogs> withKeyedOutput(IcebergScanConfig scanConfig) { - return new ReadFromChangelogs<>(scanConfig, true); - } - - /** - * Determines the keyed output coder, which depends on the requested projected schema and the - * schema's identifier fields. - */ - static KvCoder keyedOutputCoder(IcebergScanConfig scanConfig) { + /** Computes the keyed output coder, which depends on the table's primary key spec. */ + private KvCoder keyedOutputCoder(IcebergScanConfig scanConfig) { org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); - Schema rowAndSnapshotIDBeamSchema = rowAndSnapshotIDBeamSchema(scanConfig); return KvCoder.of( SchemaCoder.of(rowAndSnapshotIDBeamSchema), SchemaCoder.of(icebergSchemaToBeamSchema(recordSchema))); @@ -114,186 +131,380 @@ private static Schema rowAndSnapshotIDBeamSchema(IcebergScanConfig scanConfig) { return new Schema(fields); } - @Setup - public void setup() { - // StructProjection is not serializable, so we need to recompute it when the DoFn gets - // deserialized - org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); - this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames()); - this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema); + @Override + public CdcOutput expand(PCollectionTuple input) { + PCollection>> uniDirectionalChanges = + input.get(UNIDIRECTIONAL_CHANGES); + PCollection>> biDirectionalChanges = + input.get(BIDIRECTIONAL_CHANGES); + + // === UNIDIRECTIONAL changes === + // (i.e. 
only deletes, or only inserts) + // take the fast approach of just reading and emitting CDC records. + PCollection uniDirectionalCdcRows = + uniDirectionalChanges + .apply(Redistribute.arbitrarily()) + .apply( + "Read Uni-Directional Changes", + ParDo.of(ReadDoFn.unidirectional(scanConfig)) + .withOutputTags(UNIDIRECTIONAL_ROWS, TupleTagList.empty())) + .get(UNIDIRECTIONAL_ROWS) + .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema())); + + // === BIDIRECTIONAL changes === + // (i.e. a mix of deletes and inserts) + // will need to be prepared for a downstream CoGBK shuffle to identify potential updates + PCollectionTuple biDirectionalKeyedCdcRows = + biDirectionalChanges + .apply(Redistribute.arbitrarily()) + .apply( + "Read Bi-Directional Changes", + ParDo.of(ReadDoFn.bidirectional(scanConfig)) + .withOutputTags(KEYED_INSERTS, TupleTagList.of(KEYED_DELETES))); + + // set a windowing strategy to maintain the earliest timestamp + // this allows us to emit records afterward that may have larger reified timestamps + Window>> windowingStrategy = + Window.>>into(new GlobalWindows()) + .withTimestampCombiner(TimestampCombiner.EARLIEST); + + // Reify to preserve the element's timestamp. This is currently a no-op because we are + // setting the ordinal's commit timestamp for all records. + // But this will matter if user configures a watermark column to derive + // timestamps from (not supported yet) + KvCoder keyedOutputCoder = keyedOutputCoder(scanConfig); + PCollection>> keyedInsertsWithTimestamps = + biDirectionalKeyedCdcRows + .get(KEYED_INSERTS) + .setCoder(keyedOutputCoder) + .apply("Reify INSERT Timestamps", Reify.timestampsInValue()) + .apply("Re-window INSERTs", windowingStrategy); + PCollection>> keyedDeletesWithTimestamps = + biDirectionalKeyedCdcRows + .get(KEYED_DELETES) + .setCoder(keyedOutputCoder) + .apply("Reify DELETE Timestamps", Reify.timestampsInValue()) + .apply("Re-window DELETEs", windowingStrategy); + + return new CdcOutput( + input.getPipeline(), + uniDirectionalCdcRows, + keyedInsertsWithTimestamps, + keyedDeletesWithTimestamps); } - @ProcessElement - public void process( - @Element KV> element, - RestrictionTracker tracker, - MultiOutputReceiver out) - throws IOException { - // TODO: use TableCache - Table table = scanConfig.getTable(); - table.refresh(); - - List tasks = element.getValue(); - - for (long l = tracker.currentRestriction().getFrom(); - l < tracker.currentRestriction().getTo(); - l++) { - if (!tracker.tryClaim(l)) { - return; - } + public static class CdcOutput implements POutput { + private final Pipeline pipeline; + private final PCollection uniDirectionalRows; + private final PCollection>> keyedInserts; + private final PCollection>> keyedDeletes; - SerializableChangelogTask task = tasks.get((int) l); - switch (task.getType()) { - case ADDED_ROWS: - processAddedRowsTask(task, table, out); - break; - case DELETED_ROWS: - processDeletedRowsTask(task, table, out); - break; - case DELETED_FILE: - processDeletedFileTask(task, table, out); - break; - } + CdcOutput( + Pipeline p, + PCollection uniDirectionalRows, + PCollection>> keyedInserts, + PCollection>> keyedDeletes) { + this.pipeline = p; + this.uniDirectionalRows = uniDirectionalRows; + this.keyedInserts = keyedInserts; + this.keyedDeletes = keyedDeletes; } + + PCollection uniDirectionalRows() { + return uniDirectionalRows; + } + + PCollection>> keyedInserts() { + return keyedInserts; + } + + PCollection>> keyedDeletes() { + return keyedDeletes; + } + + @Override + public 
Pipeline getPipeline() { + return pipeline; + } + + @Override + public Map, PValue> expand() { + return ImmutableMap.of( + UNIDIRECTIONAL_ROWS, + uniDirectionalRows, + KEYED_INSERTS, + keyedInserts, + KEYED_DELETES, + keyedDeletes); + } + + @Override + public void finishSpecifyingOutput( + String transformName, PInput input, PTransform transform) {} } - private void processAddedRowsTask( - SerializableChangelogTask task, Table table, MultiOutputReceiver outputReceiver) - throws IOException { - try (CloseableIterable fullIterable = ReadUtils.createReader(task, table, scanConfig)) { - DeleteFilter deleteFilter = - ReadUtils.genericDeleteFilter( - table, scanConfig, task.getDataFile().getPath(), task.getAddedDeletes()); - CloseableIterable filtered = deleteFilter.filter(fullIterable); - - for (Record rec : filtered) { - outputRecord( - "INSERT", - rec, - outputReceiver, - task.getCommitSnapshotId(), - task.getTimestampMillis(), - KEYED_INSERTS); + @DoFn.BoundedPerElement + private static class ReadDoFn + extends DoFn>, OutT> { + private final IcebergScanConfig scanConfig; + private final boolean keyedOutput; + private transient StructProjection recordIdProjection; + private transient org.apache.iceberg.Schema recordIdSchema; + private final Schema beamRowSchema; + private final Schema rowAndSnapshotIDBeamSchema; + + /** Used for uni-directional changes. Records are output immediately as-is. */ + static ReadDoFn unidirectional(IcebergScanConfig scanConfig) { + return new ReadDoFn<>(scanConfig, false); + } + + /** + * Used for bi-directional changes. Records are keyed by (primary key, snapshot ID) and sent to + * a CoGBK. + */ + static ReadDoFn> bidirectional(IcebergScanConfig scanConfig) { + return new ReadDoFn<>(scanConfig, true); + } + + private ReadDoFn(IcebergScanConfig scanConfig, boolean keyedOutput) { + this.scanConfig = scanConfig; + this.keyedOutput = keyedOutput; + + this.beamRowSchema = icebergSchemaToBeamSchema(scanConfig.getProjectedSchema()); + org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); + this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames()); + this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema); + + this.rowAndSnapshotIDBeamSchema = rowAndSnapshotIDBeamSchema(scanConfig); + } + + @Setup + public void setup() { + // StructProjection is not serializable, so we need to recompute it when the DoFn gets + // deserialized + org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); + this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames()); + this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema); + } + + @ProcessElement + public void process( + @Element KV> element, + RestrictionTracker tracker, + MultiOutputReceiver out) + throws IOException { + // TODO: use TableCache + Table table = scanConfig.getTable(); + table.refresh(); + + List tasks = element.getValue(); + + for (long l = tracker.currentRestriction().getFrom(); + l < tracker.currentRestriction().getTo(); + l++) { + if (!tracker.tryClaim(l)) { + return; + } + + SerializableChangelogTask task = tasks.get((int) l); + switch (task.getType()) { + case ADDED_ROWS: + processAddedRowsTask(task, table, out); + break; + case DELETED_ROWS: + processDeletedRowsTask(task, table, out); + break; + case DELETED_FILE: + processDeletedFileTask(task, table, out); + break; + } } } - numAddedRowsScanTasksCompleted.inc(); - } - private void processDeletedRowsTask( - SerializableChangelogTask task, Table 
table, MultiOutputReceiver outputReceiver) - throws IOException { - DeleteFilter existingDeletesFilter = - ReadUtils.genericDeleteFilter( - table, scanConfig, task.getDataFile().getPath(), task.getExistingDeletes()); - DeleteReader newDeletesReader = - ReadUtils.genericDeleteReader( - table, scanConfig, task.getDataFile().getPath(), task.getAddedDeletes()); - - try (CloseableIterable allRecords = ReadUtils.createReader(task, table, scanConfig)) { - CloseableIterable liveRecords = existingDeletesFilter.filter(allRecords); - CloseableIterable newlyDeletedRecords = newDeletesReader.read(liveRecords); - - for (Record rec : newlyDeletedRecords) { - // TODO: output with DELETE kind - outputRecord( - "DELETE", - rec, - outputReceiver, - task.getCommitSnapshotId(), - task.getTimestampMillis(), - KEYED_DELETES); + /** + * 1. Reads the added DataFile. 2. Filters out any matching deletes. This may happen if a + * matching position delete file is committed in the same snapshot or if changes for multiple + * snapshots are squashed together. 3. Outputs record. + */ + private void processAddedRowsTask( + SerializableChangelogTask task, Table table, MultiOutputReceiver outputReceiver) + throws IOException { + try (CloseableIterable fullIterable = + ReadUtils.createReader(task, table, scanConfig)) { + + // TODO: AddedRowsScanTask comes with a datafile and potential deletes on that new datafile + // (that happened in the same commit). + // Should we: + // 1. Only output the (non-deleted) inserted records? + // 2. Or output all inserted records and also all deleted records? + // Currently we do 1 (only output what is actually 'inserted' in this commit). + DeleteFilter deleteFilter = + ReadUtils.genericDeleteFilter( + table, scanConfig, task.getDataFile().getPath(), task.getAddedDeletes()); + CloseableIterable filtered = deleteFilter.filter(fullIterable); + + for (Record rec : filtered) { + outputRecord( + "INSERT", + rec, + outputReceiver, + task.getCommitSnapshotId(), + task.getTimestampMillis(), + KEYED_INSERTS); + } } + numAddedRowsScanTasksCompleted.inc(); } - numDeletedRowsScanTasksCompleted.inc(); - } - private void processDeletedFileTask( - SerializableChangelogTask task, Table table, MultiOutputReceiver outputReceiver) - throws IOException { - try (CloseableIterable fullIterable = ReadUtils.createReader(task, table, scanConfig)) { - DeleteFilter deleteFilter = + /** + * + * + *
    + *
  1. Fetches the referenced DataFile (that deletes will be applied to) and iterates over + * records. + *
  2. Applies a filter to ignore any existing deletes. + *
  3. Applies a filter to read only the new deletes. + *
  4. Outputs records with delete row kind. + *
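A self-contained sketch of the set logic these steps amount to, with made-up row ids standing in for records:

import java.util.HashSet;
import java.util.Set;

class DeletedRowsTaskSketch {
  public static void main(String[] args) {
    Set<Long> allRows = new HashSet<>(Set.of(1L, 2L, 3L, 4L)); // rows in the referenced DataFile
    Set<Long> existingDeletes = Set.of(1L); // already deleted by earlier snapshots
    Set<Long> addedDeletes = Set.of(2L, 3L); // matched by delete files added in this snapshot

    allRows.removeAll(existingDeletes); // step 2: drop rows that were already deleted
    allRows.retainAll(addedDeletes); // step 3: keep only the newly deleted rows
    System.out.println(allRows); // step 4: e.g. [2, 3], emitted as DELETE records
  }
}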
+ */ + private void processDeletedRowsTask( + SerializableChangelogTask task, Table table, MultiOutputReceiver outputReceiver) + throws IOException { + DeleteFilter existingDeletesFilter = ReadUtils.genericDeleteFilter( table, scanConfig, task.getDataFile().getPath(), task.getExistingDeletes()); - CloseableIterable filtered = deleteFilter.filter(fullIterable); - for (Record rec : filtered) { - // TODO: output with DELETE kind - outputRecord( - "DELETE-DF", - rec, - outputReceiver, - task.getCommitSnapshotId(), - task.getTimestampMillis(), - KEYED_DELETES); + DeleteReader newDeletesReader = + ReadUtils.genericDeleteReader( + table, scanConfig, task.getDataFile().getPath(), task.getAddedDeletes()); + + try (CloseableIterable allRecords = ReadUtils.createReader(task, table, scanConfig)) { + CloseableIterable liveRecords = existingDeletesFilter.filter(allRecords); + CloseableIterable newlyDeletedRecords = newDeletesReader.read(liveRecords); + + for (Record rec : newlyDeletedRecords) { + // TODO: output with DELETE kind + outputRecord( + "DELETE", + rec, + outputReceiver, + task.getCommitSnapshotId(), + task.getTimestampMillis(), + KEYED_DELETES); + } } + numDeletedRowsScanTasksCompleted.inc(); } - numDeletedDataFileScanTasksCompleted.inc(); - } - private void outputRecord( - String type, - Record rec, - MultiOutputReceiver outputReceiver, - long snapshotId, - long timestampMillis, - TupleTag> keyedTag) { - Row row = IcebergUtils.icebergRecordToBeamRow(beamRowSchema, rec); - Instant timestamp = Instant.ofEpochMilli(timestampMillis); - if (keyedOutput) { // slow path - StructProjection recId = recordIdProjection.wrap(rec); - // Create a Row ID consisting of: - // 1. the task's commit snapshot ID - // 2. the record ID column values - // This is needed to sufficiently distinguish a record change - Row id = structToBeamRow(snapshotId, recId, recordIdSchema, rowAndSnapshotIDBeamSchema); - outputReceiver.get(keyedTag).outputWithTimestamp(KV.of(id, row), timestamp); - } else { // fast path - System.out.printf("[UNIDIRECTIONAL] -- %s(%s, %s)\n%s%n", type, snapshotId, timestamp, row); - outputReceiver.get(UNIDIRECTIONAL_ROWS).outputWithTimestamp(row, timestamp); + /** + * + * + *
    + *
  1. Fetches the referenced DataFile (that deletes will be applied to) and iterates over + * records. + *
  2. Applies a filter to ignore any existing deletes. + *
  3. Outputs records with delete row kind. + *
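The same sketch adjusted for this case: there is no new-delete matching, so every row that was still live in the dropped file is emitted as deleted.

import java.util.HashSet;
import java.util.Set;

class DeletedFileTaskSketch {
  public static void main(String[] args) {
    Set<Long> allRows = new HashSet<>(Set.of(1L, 2L, 3L)); // rows in the deleted DataFile
    Set<Long> existingDeletes = Set.of(3L); // already deleted by earlier snapshots
    allRows.removeAll(existingDeletes); // step 2: keep only rows that were still live
    System.out.println(allRows); // step 3: e.g. [1, 2], all emitted as DELETE records
  }
}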
+ */ + private void processDeletedFileTask( + SerializableChangelogTask task, Table table, MultiOutputReceiver outputReceiver) + throws IOException { + try (CloseableIterable fullIterable = + ReadUtils.createReader(task, table, scanConfig)) { + DeleteFilter deleteFilter = + ReadUtils.genericDeleteFilter( + table, scanConfig, task.getDataFile().getPath(), task.getExistingDeletes()); + CloseableIterable filtered = deleteFilter.filter(fullIterable); + for (Record rec : filtered) { + // TODO: output with DELETE kind + outputRecord( + "DELETE-DF", + rec, + outputReceiver, + task.getCommitSnapshotId(), + task.getTimestampMillis(), + KEYED_DELETES); + } + } + numDeletedDataFileScanTasksCompleted.inc(); } - } - public static Row structToBeamRow( - long snapshotId, StructLike struct, org.apache.iceberg.Schema schema, Schema beamSchema) { - ImmutableMap.Builder values = ImmutableMap.builder(); - List columns = schema.columns(); - for (Types.NestedField column : columns) { - String name = column.name(); - Object value = schema.accessorForField(column.fieldId()).get(struct); - values.put(name, value); + /** + * Outputs records to the appropriate downstream collection. + * + *

If this DoFn is configured for uni-directional changes, records are output directly to the + * {@link ReadFromChangelogs#UNIDIRECTIONAL_ROWS} tag. + * + *

If this DoFn is configured for bi-directional changes, records will be keyed by their + * Primary Key and commit snapshot ID, then output to either {@link + * ReadFromChangelogs#KEYED_INSERTS} or {@link ReadFromChangelogs#KEYED_DELETES}. + */ + private void outputRecord( + String type, + Record rec, + MultiOutputReceiver outputReceiver, + long snapshotId, + long timestampMillis, + TupleTag> keyedTag) { + Row row = IcebergUtils.icebergRecordToBeamRow(beamRowSchema, rec); + Instant timestamp = Instant.ofEpochMilli(timestampMillis); + if (keyedOutput) { // slow path + StructProjection recId = recordIdProjection.wrap(rec); + // Create a Row ID consisting of: + // 1. the task's commit snapshot ID + // 2. the record ID column values + // This is needed to sufficiently distinguish a record change + Row id = structToBeamRow(snapshotId, recId, recordIdSchema, rowAndSnapshotIDBeamSchema); + outputReceiver.get(keyedTag).outputWithTimestamp(KV.of(id, row), timestamp); + } else { // fast path + System.out.printf("[UNIDIRECTIONAL] -- %s(%s, %s)\n%s%n", type, snapshotId, timestamp, row); + outputReceiver.get(UNIDIRECTIONAL_ROWS).outputWithTimestamp(row, timestamp); + } } - // Include snapshot ID as part of the row ID. - // This is essential to ensure that the downstream ReconcileChanges compares rows - // within the same operation. - values.put(SNAPSHOT_FIELD, snapshotId); - return Row.withSchema(beamSchema).withFieldValues(values.build()).build(); - } - @GetSize - public double getSize( - @Element KV> element, - @Restriction OffsetRange restriction) { - // TODO(ahmedabu98): this is just the compressed byte size. find a way to make a better estimate - long size = 0; - - for (long l = restriction.getFrom(); l < restriction.getTo(); l++) { - SerializableChangelogTask task = element.getValue().get((int) l); - size += task.getDataFile().getFileSizeInBytes(); - size += - task.getAddedDeletes().stream() - .mapToLong(SerializableDeleteFile::getFileSizeInBytes) - .sum(); - size += - task.getExistingDeletes().stream() - .mapToLong(SerializableDeleteFile::getFileSizeInBytes) - .sum(); + public static Row structToBeamRow( + long snapshotId, StructLike struct, org.apache.iceberg.Schema schema, Schema beamSchema) { + ImmutableMap.Builder values = ImmutableMap.builder(); + List columns = schema.columns(); + for (Types.NestedField column : columns) { + String name = column.name(); + Object value = schema.accessorForField(column.fieldId()).get(struct); + values.put(name, value); + } + // Include snapshot ID as part of the row ID. + // This is essential to ensure that the downstream ReconcileChanges compares rows + // within the same operation. + values.put(SNAPSHOT_FIELD, snapshotId); + return Row.withSchema(beamSchema).withFieldValues(values.build()).build(); } - return size; - } + @GetSize + public double getSize( + @Element KV> element, + @Restriction OffsetRange restriction) { + // TODO(ahmedabu98): this is just the compressed byte size. 
find a way to make a better byte + // size estimate + long size = 0; + + for (long l = restriction.getFrom(); l < restriction.getTo(); l++) { + SerializableChangelogTask task = element.getValue().get((int) l); + size += task.getDataFile().getFileSizeInBytes(); + size += + task.getAddedDeletes().stream() + .mapToLong(SerializableDeleteFile::getFileSizeInBytes) + .sum(); + size += + task.getExistingDeletes().stream() + .mapToLong(SerializableDeleteFile::getFileSizeInBytes) + .sum(); + } + + return size; + } - @GetInitialRestriction - public OffsetRange getInitialRange( - @Element KV> element) { - return new OffsetRange(0, element.getValue().size()); + @GetInitialRestriction + public OffsetRange getInitialRange( + @Element KV> element) { + return new OffsetRange(0, element.getValue().size()); + } } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java index 0647e546d64a..1afa5add3d32 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java @@ -24,17 +24,26 @@ import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.sdk.values.TupleTag; -import org.joda.time.Instant; +/** + * Receives inserts and deletes, keyed by snapshot ID and Primary Key, and determines if any updates + * have occurred. + * + *

If the element has a mix of inserts and deletes, it is considered an update: the DELETE is emitted
+ * as UPDATE_BEFORE and the INSERT as UPDATE_AFTER.
+ *
+ *

Otherwise, records are output as-is: INSERT as INSERT, and DELETE as DELETE. + * + *
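+ * For example (hypothetical values): if snapshot S rewrites the row with primary key id=7 and
+ * also appends a new row with id=9, the group keyed by (S, id=7) holds one DELETE (the old image)
+ * and one INSERT (the new image) and is emitted as an UPDATE_BEFORE/UPDATE_AFTER pair, while the
+ * group keyed by (S, id=9) holds a single INSERT and is emitted as an INSERT.
+ *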

Input elements have their timestamp reified. This is because CoGroupByKey assigns all elements + * in a window with the same timestamp, erasing individual record timestamps. This DoFn preserves it + * by outputting records with their reified timestamps. + */ public class ReconcileChanges extends DoFn, Row> { public static final TupleTag> DELETES = new TupleTag<>() {}; public static final TupleTag> INSERTS = new TupleTag<>() {}; @DoFn.ProcessElement - public void processElement( - @Element KV element, - @Timestamp Instant timestamp, - OutputReceiver out) { + public void processElement(@Element KV element, OutputReceiver out) { CoGbkResult result = element.getValue(); // iterables are lazy-loaded from the shuffle service diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java index 6d5920ae5a7e..07510a3e54db 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/SerializableChangelogTask.java @@ -17,7 +17,7 @@ */ package org.apache.beam.sdk.io.iceberg.cdc; -import static com.google.common.base.Preconditions.checkState; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; import com.google.auto.value.AutoValue; import java.util.Collections; diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/package-info.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/package-info.java new file mode 100644 index 000000000000..8285d91689be --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/package-info.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Iceberg CDC connectors. 
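+ *
+ * <p>A minimal usage sketch, reading CDC {@code Row}s through the Managed API (the table name,
+ * the {@code catalogProps} variable, and the config keys below are illustrative assumptions;
+ * consult the Managed IcebergIO documentation for the supported options):
+ *
+ * <pre>{@code
+ * Map<String, Object> config =
+ *     ImmutableMap.of(
+ *         "table", "db.users",
+ *         "catalog_properties", catalogProps, // hypothetical catalog configuration
+ *         "streaming", true);
+ *
+ * PCollection<Row> changes =
+ *     PCollectionRowTuple.empty(pipeline)
+ *         .apply(Managed.read(Managed.ICEBERG_CDC).withConfig(config))
+ *         .getSinglePCollection();
+ * }</pre>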
*/ +package org.apache.beam.sdk.io.iceberg.cdc; From 102708b0a7b240a9a807e41031ddb79c0d984ae6 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 30 Dec 2025 13:47:41 -0500 Subject: [PATCH 9/9] fix spotless and existing tests --- .../sdk/io/iceberg/IcebergScanConfig.java | 2 +- .../apache/beam/sdk/io/iceberg/ReadUtils.java | 50 ------------------- .../sdk/io/iceberg/SerializableDataFile.java | 2 +- .../beam/sdk/io/iceberg/TableCache.java | 8 +-- .../sdk/io/iceberg/cdc/ChangelogScanner.java | 6 +-- .../io/iceberg/cdc/ReadFromChangelogs.java | 24 ++++----- .../sdk/io/iceberg/cdc/ReconcileChanges.java | 10 ++-- .../io/iceberg/SerializableDataFileTest.java | 3 ++ 8 files changed, 29 insertions(+), 76 deletions(-) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java index 3c042567a29e..3829baa43665 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java @@ -127,7 +127,7 @@ public org.apache.iceberg.Schema getRequiredSchema() { if (cachedRequiredSchema == null) { cachedRequiredSchema = resolveSchema( - IcebergUtils.beamSchemaToIcebergSchema(getSchema()), + getTable().schema(), getKeepFields(), getDropFields(), FilterUtils.getReferencedFieldNames(getFilterString())); diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java index e918902ea47d..4a319663f994 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ReadUtils.java @@ -290,31 +290,6 @@ public BeamDeleteFilter( this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); } - // TODO: remove this (unused) - @SuppressWarnings("method.invocation") - public BeamDeleteFilter( - FileIO io, - SerializableChangelogTask scanTask, - Schema tableSchema, - Schema projectedSchema, - List deleteFiles) { - super(scanTask.getDataFile().getPath(), deleteFiles, tableSchema, projectedSchema); - this.io = io; - this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); - } - - // TODO: remove this (unused) - @SuppressWarnings("method.invocation") - public BeamDeleteFilter(FileIO io, ContentScanTask scanTask, List deleteFiles) { - super( - scanTask.file().location(), - deleteFiles, - scanTask.spec().schema(), - scanTask.spec().schema()); - this.io = io; - this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); - } - @Override protected StructLike asStructLike(Record record) { return asStructLike.wrap(record); @@ -342,31 +317,6 @@ public BeamDeleteReader( this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); } - // TODO: remove this (unused) - @SuppressWarnings("method.invocation") - public BeamDeleteReader( - FileIO io, - SerializableChangelogTask scanTask, - Schema tableSchema, - Schema projectedSchema, - List deleteFiles) { - super(scanTask.getDataFile().getPath(), deleteFiles, tableSchema, projectedSchema); - this.io = io; - this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); - } - - // TODO: remove this (unused) - @SuppressWarnings("method.invocation") - public BeamDeleteReader(FileIO io, ContentScanTask scanTask, List deleteFiles) { - super( - 
scanTask.file().location(), - deleteFiles, - scanTask.spec().schema(), - scanTask.spec().schema()); - this.io = io; - this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); - } - @Override protected StructLike asStructLike(Record record) { return asStructLike.wrap(record); diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java index ba3e4bfb59db..2a80fea352e7 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SerializableDataFile.java @@ -111,7 +111,7 @@ public static Builder builder() { public abstract @Nullable Long getFirstRowId(); @AutoValue.Builder - abstract static class Builder { + public abstract static class Builder { abstract Builder setPath(String path); abstract Builder setFileFormat(String fileFormat); diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/TableCache.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/TableCache.java index cb00d90f7fb3..d9d8802e2b49 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/TableCache.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/TableCache.java @@ -33,7 +33,7 @@ import org.apache.iceberg.catalog.TableIdentifier; /** Utility to fetch and cache Iceberg {@link Table}s. */ -class TableCache { +public class TableCache { private static final Map CATALOG_CACHE = new ConcurrentHashMap<>(); private static final LoadingCache INTERNAL_CACHE = CacheBuilder.newBuilder() @@ -55,7 +55,7 @@ public ListenableFuture reload(String unusedIdentifier, Table table) { } });; - static Table get(String identifier) { + public static Table get(String identifier) { try { return INTERNAL_CACHE.get(identifier); } catch (ExecutionException e) { @@ -65,12 +65,12 @@ static Table get(String identifier) { } /** Forces a table refresh and returns. 
*/ - static Table getRefreshed(String identifier) { + public static Table getRefreshed(String identifier) { INTERNAL_CACHE.refresh(identifier); return get(identifier); } - static void setup(IcebergScanConfig scanConfig) { + public static void setup(IcebergScanConfig scanConfig) { String tableIdentifier = scanConfig.getTableIdentifier(); IcebergCatalogConfig catalogConfig = scanConfig.getCatalogConfig(); if (CATALOG_CACHE.containsKey(tableIdentifier)) { diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java index 5aeaa3806650..921fa303081f 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java @@ -31,6 +31,7 @@ import org.apache.beam.sdk.coders.ListCoder; import org.apache.beam.sdk.io.iceberg.IcebergScanConfig; import org.apache.beam.sdk.io.iceberg.SnapshotInfo; +import org.apache.beam.sdk.io.iceberg.TableCache; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.transforms.DoFn; @@ -105,14 +106,13 @@ public class ChangelogScanner ChangelogScanner(IcebergScanConfig scanConfig) { this.scanConfig = scanConfig; + TableCache.setup(scanConfig); } @ProcessElement public void process(@Element KV> element, MultiOutputReceiver out) throws IOException { - // TODO: use TableCache here - Table table = scanConfig.getTable(); - table.refresh(); + Table table = TableCache.getRefreshed(scanConfig.getTableIdentifier()); List snapshots = element.getValue(); SnapshotInfo startSnapshot = snapshots.get(0); diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java index b06b95f1ac8d..fb71d95faa2f 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java @@ -20,6 +20,7 @@ import static org.apache.beam.sdk.io.iceberg.IcebergUtils.icebergSchemaToBeamSchema; import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.BIDIRECTIONAL_CHANGES; import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.UNIDIRECTIONAL_CHANGES; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import java.io.IOException; import java.util.List; @@ -30,6 +31,7 @@ import org.apache.beam.sdk.io.iceberg.IcebergUtils; import org.apache.beam.sdk.io.iceberg.ReadUtils; import org.apache.beam.sdk.io.iceberg.SerializableDeleteFile; +import org.apache.beam.sdk.io.iceberg.TableCache; import org.apache.beam.sdk.io.range.OffsetRange; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; @@ -63,6 +65,7 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.StructProjection; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.joda.time.Instant; /** @@ -248,8 +251,8 @@ private static class ReadDoFn extends DoFn>, OutT> { private final IcebergScanConfig scanConfig; private final boolean keyedOutput; - private transient StructProjection recordIdProjection; - private transient org.apache.iceberg.Schema recordIdSchema; + private transient @MonotonicNonNull 
StructProjection recordIdProjection; + private transient org.apache.iceberg.@MonotonicNonNull Schema recordIdSchema; private final Schema beamRowSchema; private final Schema rowAndSnapshotIDBeamSchema; @@ -271,10 +274,6 @@ private ReadDoFn(IcebergScanConfig scanConfig, boolean keyedOutput) { this.keyedOutput = keyedOutput; this.beamRowSchema = icebergSchemaToBeamSchema(scanConfig.getProjectedSchema()); - org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); - this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames()); - this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema); - this.rowAndSnapshotIDBeamSchema = rowAndSnapshotIDBeamSchema(scanConfig); } @@ -285,6 +284,7 @@ public void setup() { org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema(); this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames()); this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema); + TableCache.setup(scanConfig); } @ProcessElement @@ -293,9 +293,7 @@ public void process( RestrictionTracker tracker, MultiOutputReceiver out) throws IOException { - // TODO: use TableCache - Table table = scanConfig.getTable(); - table.refresh(); + Table table = TableCache.get(scanConfig.getTableIdentifier()); List tasks = element.getValue(); @@ -448,15 +446,17 @@ private void outputRecord( Row row = IcebergUtils.icebergRecordToBeamRow(beamRowSchema, rec); Instant timestamp = Instant.ofEpochMilli(timestampMillis); if (keyedOutput) { // slow path - StructProjection recId = recordIdProjection.wrap(rec); + StructProjection recId = checkStateNotNull(recordIdProjection).wrap(rec); // Create a Row ID consisting of: // 1. the task's commit snapshot ID // 2. the record ID column values // This is needed to sufficiently distinguish a record change - Row id = structToBeamRow(snapshotId, recId, recordIdSchema, rowAndSnapshotIDBeamSchema); + Row id = + structToBeamRow( + snapshotId, recId, checkStateNotNull(recordIdSchema), rowAndSnapshotIDBeamSchema); outputReceiver.get(keyedTag).outputWithTimestamp(KV.of(id, row), timestamp); } else { // fast path - System.out.printf("[UNIDIRECTIONAL] -- %s(%s, %s)\n%s%n", type, snapshotId, timestamp, row); + System.out.printf("[UNIDIRECTIONAL] -- %s(%s, %s)%n%s%n", type, snapshotId, timestamp, row); outputReceiver.get(UNIDIRECTIONAL_ROWS).outputWithTimestamp(row, timestamp); } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java index 1afa5add3d32..de56efa8f737 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java @@ -64,30 +64,30 @@ public void processElement(@Element KV element, OutputReceiver // TODO: output as UPDATE_BEFORE kind TimestampedValue updateBefore = deletesIterator.next(); out.outputWithTimestamp(updateBefore.getValue(), updateBefore.getTimestamp()); - System.out.printf("[BIDIRECTIONAL] -- UpdateBefore\n%s\n", updateBefore); + System.out.printf("[BIDIRECTIONAL] -- UpdateBefore%n%s%n", updateBefore); // TODO: output as UPDATE_AFTER kind TimestampedValue updateAfter = insertsIterator.next(); out.outputWithTimestamp(updateAfter.getValue(), updateAfter.getTimestamp()); - System.out.printf("[BIDIRECTIONAL] -- UpdateAfter\n%s\n", updateAfter); + System.out.printf("[BIDIRECTIONAL] 
-- UpdateAfter%n%s%n", updateAfter); } while (insertsIterator.hasNext()) { // TODO: output as UPDATE_AFTER kind TimestampedValue insert = insertsIterator.next(); out.outputWithTimestamp(insert.getValue(), insert.getTimestamp()); - System.out.printf("[BIDIRECTIONAL] -- Added(extra)\n%s\n", insert); + System.out.printf("[BIDIRECTIONAL] -- Added(extra)%n%s%n", insert); } } else if (hasInserts) { // INSERT only for (TimestampedValue rec : inserts) { - System.out.printf("[BIDIRECTIONAL] -- Added\n%s\n", rec); + System.out.printf("[BIDIRECTIONAL] -- Added%n%s%n", rec); out.outputWithTimestamp(rec.getValue(), rec.getTimestamp()); } } else if (hasDeletes) { // DELETE only for (TimestampedValue rec : deletes) { // TODO: output as DELETE kind - System.out.printf("[BIDIRECTIONAL] -- Deleted\n%s\n", rec); + System.out.printf("[BIDIRECTIONAL] -- Deleted%n%s%n", rec); out.outputWithTimestamp(rec.getValue(), rec.getTimestamp()); } } diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/SerializableDataFileTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/SerializableDataFileTest.java index 983f021fd7ce..58bc55744ec6 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/SerializableDataFileTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/SerializableDataFileTest.java @@ -47,6 +47,9 @@ public class SerializableDataFileTest { .add("nanValueCounts") .add("lowerBounds") .add("upperBounds") + .add("dataSequenceNumber") + .add("fileSequenceNumber") + .add("firstRowId") .build(); @Test