Skip to content

Commit d6fd069

Browse files
authored
[Iceberg] Support nested column paths in keep/drop configuration (#37516)
* [Iceberg] Support nested column paths in keep/drop configuration. This change fixes the validation logic in IcebergScanConfig to support nested column paths using dot notation (e.g., "data.name"). Previously, the validation only checked top-level column names, causing nested paths like "colA.colB" to fail with an "unknown field(s)" error. The fix uses Iceberg's Schema.findField(), which natively resolves dot-notation paths for nested fields. Fixes #37486. * [Iceberg] Also fix drop functionality for nested column paths: use TypeUtil.indexByName() to enumerate all field paths; only select leaf fields to prevent a parent struct from including dropped children; add a test for nested drop validation. * Apply Spotless formatting. * Rework nested column pruning to match Beam code style: replace inline fully-qualified names with proper imports; use TypeUtil.indexNameById (canonical paths only) instead of indexByName (which includes short aliases that break drop logic); remove verbose line-by-line comments; consolidate tests into a single method using sameSchema() assertions to match the existing testProjectedSchema() pattern.
1 parent b2e97de commit d6fd069

2 files changed

Lines changed: 43 additions & 7 deletions

File tree

sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergScanConfig.java

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@
2424
import com.google.auto.value.AutoValue;
2525
import java.io.Serializable;
2626
import java.util.ArrayList;
27+
import java.util.Collections;
2728
import java.util.List;
2829
import java.util.Set;
29-
import java.util.stream.Collectors;
3030
import org.apache.beam.sdk.io.iceberg.IcebergIO.ReadRows.StartingStrategy;
3131
import org.apache.beam.sdk.schemas.Schema;
3232
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
@@ -37,7 +37,7 @@
3737
import org.apache.iceberg.catalog.TableIdentifier;
3838
import org.apache.iceberg.expressions.Evaluator;
3939
import org.apache.iceberg.expressions.Expression;
40-
import org.apache.iceberg.types.Types;
40+
import org.apache.iceberg.types.TypeUtil;
4141
import org.checkerframework.checker.nullness.qual.MonotonicNonNull;
4242
import org.checkerframework.checker.nullness.qual.Nullable;
4343
import org.checkerframework.dataflow.qual.Pure;
@@ -93,10 +93,16 @@ static org.apache.iceberg.Schema resolveSchema(
9393
if (keep != null && !keep.isEmpty()) {
9494
selectedFieldsBuilder.addAll(keep);
9595
} else if (drop != null && !drop.isEmpty()) {
96-
Set<String> fields =
97-
schema.columns().stream().map(Types.NestedField::name).collect(Collectors.toSet());
98-
drop.forEach(fields::remove);
99-
selectedFieldsBuilder.addAll(fields);
96+
List<String> paths = new ArrayList<>(TypeUtil.indexNameById(schema.asStruct()).values());
97+
Collections.sort(paths);
98+
for (int i = 0; i < paths.size(); i++) {
99+
String path = paths.get(i);
100+
boolean isParent = i + 1 < paths.size() && paths.get(i + 1).startsWith(path + ".");
101+
boolean isDrop = drop.stream().anyMatch(d -> path.equals(d) || path.startsWith(d + "."));
102+
if (!isParent && !isDrop) {
103+
selectedFieldsBuilder.add(path);
104+
}
105+
}
100106
} else {
101107
// default: include all columns
102108
return schema;
@@ -327,7 +333,7 @@ void validate(Table table) {
327333
param = "drop";
328334
fieldsSpecified = newHashSet(checkNotNull(drop));
329335
}
330-
table.schema().columns().forEach(nf -> fieldsSpecified.remove(nf.name()));
336+
fieldsSpecified.removeIf(name -> table.schema().findField(name) != null);
331337

332338
checkArgument(
333339
fieldsSpecified.isEmpty(),

sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,36 @@ public void testProjectedSchema() {
273273
assertTrue(projectKeep.sameSchema(expectedKeep));
274274
}
275275

276+
@Test
public void testProjectedSchemaWithNestedFields() {
  // Source schema: top-level "id", a nested "data" struct with two leaf
  // fields ("name", "value"), and a top-level "metadata" column.
  org.apache.iceberg.Schema schema =
      new org.apache.iceberg.Schema(
          required(1, "id", StringType.get()),
          required(
              2,
              "data",
              StructType.of(
                  required(3, "name", StringType.get()), required(4, "value", StringType.get()))),
          required(5, "metadata", StringType.get()));

  // test nested keep: keeping "id" and "data.name" should retain the "data"
  // struct containing only its "name" leaf; "data.value" and "metadata" are
  // excluded because they were not listed.
  org.apache.iceberg.Schema projectKeep = resolveSchema(schema, asList("id", "data.name"), null);
  org.apache.iceberg.Schema expectedKeep =
      new org.apache.iceberg.Schema(
          required(1, "id", StringType.get()),
          required(2, "data", StructType.of(required(3, "name", StringType.get()))));
  assertTrue(projectKeep.sameSchema(expectedKeep));

  // test nested drop: dropping only "data.name" should preserve every other
  // field, including the sibling leaf "data.value" and the unrelated
  // top-level "metadata" column.
  org.apache.iceberg.Schema projectDrop = resolveSchema(schema, null, asList("data.name"));
  org.apache.iceberg.Schema expectedDrop =
      new org.apache.iceberg.Schema(
          required(1, "id", StringType.get()),
          required(2, "data", StructType.of(required(4, "value", StringType.get()))),
          required(5, "metadata", StringType.get()));
  assertTrue(projectDrop.sameSchema(expectedDrop));
}
305+
276306
@Test
277307
public void testSimpleScan() throws Exception {
278308
TableIdentifier tableId =

0 commit comments

Comments
 (0)