[llvm-branch-commits] [clang] [CIR] Implement Direct+canFlatten in CallConvLowering (PR #201719)

Fri Jun 12 08:20:33 PDT 2026

https://github.com/adams381 updated https://github.com/llvm/llvm-project/pull/201719

>From 9c1f9bb4c3d768bd3fbb1d0607ae7d6587c5304d Mon Sep 17 00:00:00 2001
From: Adam Smith <adams at nvidia.com>
Date: Thu, 4 Jun 2026 16:22:32 -0700
Subject: [PATCH 1/2] [CIR] Implement Direct+canFlatten in CallConvLowering

CallConvLowering previously ignored the canFlatten flag on Direct
classifications: a Direct arg with a multi-field struct coerced type was
passed as a single struct argument rather than as N scalar register
arguments.  Passing one scalar per field is the register-passing pattern
the x86-64 SysV ABI uses for structs like struct { long a, b; }.

A new helper, getFlattenedCoercedType, centralizes the detection (Direct,
multi-field struct coercedType, canFlatten set).  The three lowering sites
are updated: buildNewArgTypes pushes one wire type per field;
insertArgCoercion reassembles the coerced struct from the N scalar block
args, then coerces it to the original type if the two differ; and
rewriteCallSite coerces the operand to the coerced struct when the types
differ, then extracts each field via cir.extract_member.  The existing
coerce-record-to-record-via-memory.cir test gains can_flatten = false so
that it keeps exercising the single-arg path.
---
 .../TargetLowering/CIRABIRewriteContext.cpp   | 150 ++++++++++++--
 .../coerce-record-to-record-via-memory.cir    |   2 +-
 .../abi-lowering/direct-flatten.cir           | 190 ++++++++++++++++++
 3 files changed, 325 insertions(+), 17 deletions(-)
 create mode 100644 clang/test/CIR/Transforms/abi-lowering/direct-flatten.cir

diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRABIRewriteContext.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRABIRewriteContext.cpp
index 6f45d619f4cb9..113d68ef00e3e 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRABIRewriteContext.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRABIRewriteContext.cpp
@@ -30,6 +30,13 @@ using namespace mlir::abi;
 // into the struct via an alloca+get_member+store+load sequence.  At the
 // call site, the struct operand is decomposed into its fields using
 // cir.extract_member.
+//
+// For Direct + canFlatten (where the coerced type is a multi-field struct),
+// the coerced struct is similarly flattened into N individual wire arguments.
+// The callee reassembles the N scalar block args into the coerced struct,
+// then coerces to the original argument type if the two types differ.  The
+// call site coerces the original type to the coerced struct, then extracts
+// each field as a separate call argument.
 
 namespace {
 
@@ -46,6 +53,26 @@ bool needsRewrite(const FunctionClassification &fc) {
   return false;
 }
 
+/// Return the coerced RecordType for a Direct classification that should be
+/// flattened into individual scalar arguments, or a null type if the
+/// classification does not call for flattening.
+///
+/// Flattening applies when all four conditions hold:
+///   1. The classification is Direct with a non-null coercedType.
+///   2. canFlatten is set.
+///   3. The coercedType is a struct (not a union).
+///   4. The struct has more than one field (single-field structs are already
+///      scalar; flattening them produces no benefit and classic CodeGen skips
+///      them for the same reason).
+cir::RecordType getFlattenedCoercedType(const ArgClassification &ac) {
+  if (ac.kind != ArgKind::Direct || !ac.coercedType || !ac.canFlatten)
+    return {};
+  auto recTy = dyn_cast<cir::RecordType>(ac.coercedType);
+  if (!recTy || !recTy.isStruct() || recTy.getNumElements() <= 1)
+    return {};
+  return recTy;
+}
+
 /// Build the new argument-type list for a function whose ABI classification
 /// is \p fc.  Handles Direct (with or without coercion), Extend, Ignore,
 /// Indirect (byval and byref), and Expand (struct flattening) arguments.
@@ -61,11 +88,20 @@ LogicalResult buildNewArgTypes(ArrayRef<Type> oldArgTypes,
     Type origTy = oldArgTypes[idx];
     switch (ac.kind) {
     case ArgKind::Direct:
-      // Direct with a coerced type means the wire signature uses the
-      // coerced type; the body still expects origTy and we'll insert a
-      // coercion at the entry block.  Direct without a coerced type is a
-      // true pass-through.
-      newArgTypes.push_back(ac.coercedType ? ac.coercedType : origTy);
+      // Direct with canFlatten and a struct coerced type: push one wire type
+      // per field of the coerced struct rather than the struct itself.
+      // Single-field coerced structs fall through to the non-flatten path —
+      // the struct is already scalar-sized and flattening adds no value.
+      if (auto flatTy = getFlattenedCoercedType(ac)) {
+        for (Type memberTy : flatTy.getMembers())
+          newArgTypes.push_back(memberTy);
+      } else {
+        // Direct with a coerced type: the wire signature uses the coerced
+        // type; the body still expects origTy and insertArgCoercion recovers
+        // it via a memory round-trip.  Direct without coercion is a
+        // pass-through.
+        newArgTypes.push_back(ac.coercedType ? ac.coercedType : origTy);
+      }
       break;
     case ArgKind::Ignore:
       break;
@@ -161,9 +197,12 @@ ArrayAttr updateArgAttrs(MLIRContext *ctx, ArrayRef<Type> origArgTypes,
     DictionaryAttr existing = DictionaryAttr::get(ctx);
     if (existingArgAttrs && oldIdx < existingArgAttrs.size())
       existing = cast<DictionaryAttr>(existingArgAttrs[oldIdx]);
-    if (ac.kind == ArgKind::Expand) {
-      // Push one empty attribute dict per expanded field; the flattened
-      // scalar arguments carry no special ABI attributes.
+    if (auto flatTy = getFlattenedCoercedType(ac)) {
+      // Direct + canFlatten: one empty dict per flattened field.
+      for (unsigned i = 0; i < flatTy.getNumElements(); ++i)
+        newArgAttrs.push_back(DictionaryAttr::get(ctx));
+    } else if (ac.kind == ArgKind::Expand) {
+      // Pure Expand: one empty dict per struct field.
       auto recTy = cast<cir::RecordType>(origArgTypes[oldIdx]);
       for (unsigned i = 0; i < recTy.getNumElements(); ++i)
         newArgAttrs.push_back(DictionaryAttr::get(ctx));
@@ -419,6 +458,63 @@ void insertArgCoercion(FunctionOpInterface funcOp,
 
     BlockArgument blockArg = entry.getArgument(blockArgIdx);
 
+    if (auto flatTy = getFlattenedCoercedType(ac)) {
+      // Direct + canFlatten: the coerced type is a struct whose fields become
+      // individual wire arguments.  The reconstruction mirrors the Expand path
+      // — replace the single block arg with N scalar block args, store them
+      // into an alloca of the coerced struct type, reload — but then applies
+      // an additional coercion from the coerced struct type to the original
+      // argument type if the two differ in layout.
+      unsigned numFields = flatTy.getNumElements();
+      assert(numFields >= 2 && "getFlattenedCoercedType guarantees >1 fields");
+      Type origTy = blockArg.getType();
+      Location loc = funcOp.getLoc();
+
+      // Change slot 0 to field 0's type; insert slots 1..N-1 after it.
+      blockArg.setType(flatTy.getElementType(0));
+      for (unsigned f = 1; f < numFields; ++f)
+        entry.insertArgument(blockArgIdx + f, flatTy.getElementType(f), loc);
+
+      // setInsertionPointToStart: see comment in the Expand arm above.
+      rewriter.setInsertionPointToStart(&entry);
+      auto flatPtrTy = cir::PointerType::get(flatTy);
+      uint64_t flatAlign = dl.getTypeABIAlignment(flatTy);
+      auto flatSlot = cir::AllocaOp::create(
+          rewriter, loc, flatPtrTy, flatTy, rewriter.getStringAttr("coerce"),
+          rewriter.getI64IntegerAttr(flatAlign));
+      SmallPtrSet<Operation *, 8> flattenOps = {flatSlot};
+      for (unsigned f = 0; f < numFields; ++f) {
+        Type fieldPtrTy = cir::PointerType::get(flatTy.getElementType(f));
+        auto fieldPtr = cir::GetMemberOp::create(rewriter, loc, fieldPtrTy,
+                                                 flatSlot, /*name=*/"",
+                                                 /*index=*/f);
+        flattenOps.insert(fieldPtr);
+        auto storeOp = cir::StoreOp::create(
+            rewriter, loc, entry.getArgument(blockArgIdx + f), fieldPtr);
+        flattenOps.insert(storeOp);
+      }
+      auto flatLoaded =
+          cir::LoadOp::create(rewriter, loc, flatTy, flatSlot.getResult());
+      flattenOps.insert(flatLoaded);
+
+      // If the coerced struct type differs from the original argument type,
+      // insert a memory round-trip to recover the original type for body uses.
+      Value finalVal = flatLoaded;
+      if (origTy != flatTy) {
+        SmallPtrSet<Operation *, 4> coercionOps;
+        finalVal = emitCoercion(rewriter, loc, origTy, flatLoaded, funcOp, dl,
+                                coercionOps);
+        flattenOps.insert(coercionOps.begin(), coercionOps.end());
+      }
+
+      // Replace all original body uses of the struct block arg (now field 0)
+      // with the recovered original-type value.
+      blockArg.replaceAllUsesExcept(finalVal, flattenOps);
+
+      blockArgIdx += numFields;
+      continue;
+    }
+
     if (ac.kind == ArgKind::Direct && ac.coercedType) {
       Type oldArgTy = blockArg.getType();
       Type newArgTy = ac.coercedType;
@@ -656,7 +752,10 @@ LogicalResult CIRABIRewriteContext::rewriteFunctionDefinition(
         unsigned runningIdx = sretOffset;
         for (unsigned i = 0; i < fc.argInfos.size(); ++i) {
           classToBlockArg[i] = runningIdx;
-          if (fc.argInfos[i].kind == ArgKind::Expand) {
+          if (auto flatTy = getFlattenedCoercedType(fc.argInfos[i])) {
+            // Direct + canFlatten: N slots, one per coerced struct field.
+            runningIdx += flatTy.getNumElements();
+          } else if (fc.argInfos[i].kind == ArgKind::Expand) {
             auto recTy = cast<cir::RecordType>(oldArgTypes[i]);
             runningIdx += recTy.getNumElements();
           } else {
@@ -717,11 +816,13 @@ LogicalResult CIRABIRewriteContext::rewriteFunctionDefinition(
   // Rebuild arg_attrs when the function has an sret slot (slot 0 needs the
   // sret attribute set) or any arg is Ignore (dropped from the output array),
   // Extend (needs llvm.signext / llvm.zeroext), Indirect (needs
-  // llvm.byval / llvm.align), or Expand (changes the argument count).
+  // llvm.byval / llvm.align), Expand or Direct+canFlatten (both change the
+  // argument count).
   bool needsArgAttrUpdate =
       hasSRet || llvm::any_of(fc.argInfos, [](const ArgClassification &ac) {
         return ac.kind == ArgKind::Ignore || ac.kind == ArgKind::Extend ||
-               ac.kind == ArgKind::Indirect || ac.kind == ArgKind::Expand;
+               ac.kind == ArgKind::Indirect || ac.kind == ArgKind::Expand ||
+               getFlattenedCoercedType(ac);
       });
   if (needsArgAttrUpdate) {
     auto existing = funcOp->getAttrOfType<ArrayAttr>("arg_attrs");
@@ -792,7 +893,21 @@ LogicalResult CIRABIRewriteContext::rewriteCallSite(
     if (ac.kind == ArgKind::Ignore)
       continue;
     Value arg = argOperands[idx];
-    if (ac.kind == ArgKind::Expand) {
+    if (auto flatTy = getFlattenedCoercedType(ac)) {
+      // Direct + canFlatten: coerce the struct to the ABI-coerced struct type
+      // and then extract each field as a separate call argument.  The coercion
+      // is a memory round-trip when the original and coerced types differ in
+      // layout; when they are the same CIR type the coercion is skipped.
+      Value coerced = arg;
+      if (arg.getType() != flatTy)
+        coerced = emitCoercion(builder, call.getLoc(), flatTy, arg,
+                               enclosingFunc, dl);
+      for (unsigned f = 0; f < flatTy.getNumElements(); ++f) {
+        Value field =
+            cir::ExtractMemberOp::create(builder, call.getLoc(), coerced, f);
+        newArgs.push_back(field);
+      }
+    } else if (ac.kind == ArgKind::Expand) {
       // Decompose the struct value into its constituent scalar fields and
       // pass each as a separate argument.  cir.extract_member extracts the
       // field value directly without a memory round-trip.
@@ -887,12 +1002,14 @@ LogicalResult CIRABIRewriteContext::rewriteCallSite(
 
     // Shape the per-argument attrs exactly as the non-sret path does
     // (signext / zeroext for Extend, drop Ignore slots, byval / align for
-    // Indirect, flatten for Expand) before prepending the sret slot.
+    // Indirect, flatten for Expand and Direct+canFlatten) before prepending
+    // the sret slot.
     ArrayAttr argAttrs = call->getAttrOfType<ArrayAttr>("arg_attrs");
     bool needsArgAttrUpdate =
         llvm::any_of(fc.argInfos, [](const ArgClassification &ac) {
           return ac.kind == ArgKind::Ignore || ac.kind == ArgKind::Extend ||
-                 ac.kind == ArgKind::Indirect || ac.kind == ArgKind::Expand;
+                 ac.kind == ArgKind::Indirect || ac.kind == ArgKind::Expand ||
+                 getFlattenedCoercedType(ac);
         });
     if (needsArgAttrUpdate)
       argAttrs = updateArgAttrs(ctx, origCallArgTypes, argAttrs, fc);
@@ -946,11 +1063,12 @@ LogicalResult CIRABIRewriteContext::rewriteCallSite(
   // Layer llvm.signext / llvm.zeroext onto the new call's arg_attrs and
   // res_attrs for Extend args/return.  Ignore args require a rebuild because
   // their slots are dropped; Indirect args need llvm.byval / llvm.align;
-  // Expand args change the argument count.
+  // Expand and Direct+canFlatten args change the argument count.
   bool needsArgAttrUpdate =
       llvm::any_of(fc.argInfos, [](const ArgClassification &ac) {
         return ac.kind == ArgKind::Ignore || ac.kind == ArgKind::Extend ||
-               ac.kind == ArgKind::Indirect || ac.kind == ArgKind::Expand;
+               ac.kind == ArgKind::Indirect || ac.kind == ArgKind::Expand ||
+               getFlattenedCoercedType(ac);
       });
   if (needsArgAttrUpdate) {
     auto existing = call->getAttrOfType<ArrayAttr>("arg_attrs");
diff --git a/clang/test/CIR/Transforms/abi-lowering/coerce-record-to-record-via-memory.cir b/clang/test/CIR/Transforms/abi-lowering/coerce-record-to-record-via-memory.cir
index 62ea2378623fd..27b06ee95e0c5 100644
--- a/clang/test/CIR/Transforms/abi-lowering/coerce-record-to-record-via-memory.cir
+++ b/clang/test/CIR/Transforms/abi-lowering/coerce-record-to-record-via-memory.cir
@@ -8,7 +8,7 @@
 
 #coerce_vec4_to_twoi64 = {
   return = { kind = "direct" },
-  args   = [ { kind = "direct", coerced_type = !rec_TwoI64 } ]
+  args   = [ { kind = "direct", coerced_type = !rec_TwoI64, can_flatten = false } ]
 }
 
 #caller_no_args = {
diff --git a/clang/test/CIR/Transforms/abi-lowering/direct-flatten.cir b/clang/test/CIR/Transforms/abi-lowering/direct-flatten.cir
new file mode 100644
index 0000000000000..97559715e9173
--- /dev/null
+++ b/clang/test/CIR/Transforms/abi-lowering/direct-flatten.cir
@@ -0,0 +1,190 @@
+// RUN: cir-opt %s -cir-call-conv-lowering="classification-attr=test_classify" \
+// RUN:   | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+!s64i = !cir.int<s, 64>
+
+// Named source structs.
+!rec_TwoLong  = !cir.struct<"TwoLong"  {!s64i, !s64i}>
+!rec_ThreeInt = !cir.struct<"ThreeInt" {!s32i, !s32i, !s32i}>
+// dim3-equivalent: three i32 fields totalling 12 bytes.
+!rec_Dim3     = !cir.struct<"Dim3"     {!s32i, !s32i, !s32i}>
+
+// ABI-coerced anonymous structs.  MLIR auto-generates their aliases
+// (!rec_anon_struct, !rec_anon_struct1, etc.) in encounter order.
+!rec_coerced_two   = !cir.struct<{!s64i, !s64i}>
+!rec_coerced_three = !cir.struct<{!s32i, !s32i, !s32i}>
+// x86-64 SysV packs dim3's first two i32s into an i64; the third i32
+// is a separate half of the second eightbyte.
+!rec_coerced_dim3  = !cir.struct<{!s64i, !s32i}>
+
+#flatten_two = {
+  return = { kind = "direct" },
+  args   = [ { kind = "direct", coerced_type = !rec_coerced_two,
+               can_flatten = true } ]
+}
+
+#flatten_three = {
+  return = { kind = "direct" },
+  args   = [ { kind = "direct", coerced_type = !rec_coerced_three,
+               can_flatten = true } ]
+}
+
+#flatten_dim3 = {
+  return = { kind = "direct" },
+  args   = [ { kind = "direct", coerced_type = !rec_coerced_dim3,
+               can_flatten = true } ]
+}
+
+#flatten_ignore_flatten = {
+  return = { kind = "direct" },
+  args   = [ { kind = "direct", coerced_type = !rec_coerced_two,
+               can_flatten = true },
+             { kind = "ignore" },
+             { kind = "direct", coerced_type = !rec_coerced_two,
+               can_flatten = true } ]
+}
+
+#passthrough = {
+  return = { kind = "direct" },
+  args   = [ ]
+}
+
+module attributes {
+  dlti.dl_spec = #dlti.dl_spec<
+    #dlti.dl_entry<i32, dense<32>: vector<2xi64>>,
+    #dlti.dl_entry<i64, dense<64>: vector<2xi64>>>
+} {
+
+// Callee: the single TwoLong arg is replaced by two i64 block args.  At entry
+// the scalars are stored into a coerced-struct alloca, then coerced back to
+// TwoLong for body uses.
+
+cir.func @takes_two_long(%arg0: !rec_TwoLong)
+    attributes { test_classify = #flatten_two } {
+  %0 = cir.alloca !rec_TwoLong, !cir.ptr<!rec_TwoLong>, ["p"] {alignment = 8 : i64}
+  cir.store %arg0, %0 : !rec_TwoLong, !cir.ptr<!rec_TwoLong>
+  cir.return
+}
+
+// CHECK:      cir.func{{.*}} @takes_two_long(%[[F0:.*]]: !s64i, %[[F1:.*]]: !s64i)
+// Two ["coerce"] allocas: the coercion round-trip alloca and the flatSlot.
+// CHECK:        %{{.*}} = cir.alloca {{.*}} ["coerce"]
+// CHECK:        %[[FSLOT:.*]] = cir.alloca {{.*}} ["coerce"]
+// Store each field block arg into the flat slot.
+// CHECK:        %[[P0:.*]] = cir.get_member %[[FSLOT]][0]
+// CHECK:        cir.store %[[F0]], %[[P0]] : !s64i, !cir.ptr<!s64i>
+// CHECK:        %[[P1:.*]] = cir.get_member %[[FSLOT]][1]
+// CHECK:        cir.store %[[F1]], %[[P1]] : !s64i, !cir.ptr<!s64i>
+// Reload the coerced struct, coerce to TwoLong, use in body.
+// CHECK:        %{{.*}} = cir.load %[[FSLOT]]
+// CHECK:        cir.cast bitcast %{{.*}} : !cir.ptr<{{.*}}> -> !cir.ptr<!rec_TwoLong>
+// CHECK:        %[[TVAL:.*]] = cir.load %{{.*}} : !cir.ptr<!rec_TwoLong>, !rec_TwoLong
+// CHECK:        %[[P:.*]] = cir.alloca !rec_TwoLong, !cir.ptr<!rec_TwoLong>, ["p"]
+// CHECK:        cir.store %[[TVAL]], %[[P]] : !rec_TwoLong, !cir.ptr<!rec_TwoLong>
+
+// Forward declaration: two i64 wire args, no body.
+
+cir.func private @takes_two_long_decl(!rec_TwoLong)
+    attributes { test_classify = #flatten_two }
+
+// CHECK:      cir.func{{.*}} @takes_two_long_decl(!s64i, !s64i)
+// CHECK-NOT:  !rec_TwoLong
+
+// Caller: the TwoLong operand is coerced to the anonymous struct type via a
+// memory round-trip, then each field is extracted as a separate call argument.
+
+cir.func @caller_two_long() attributes { test_classify = #passthrough } {
+  %0 = cir.alloca !rec_TwoLong, !cir.ptr<!rec_TwoLong>, ["s"] {alignment = 8 : i64}
+  %1 = cir.load %0 : !cir.ptr<!rec_TwoLong>, !rec_TwoLong
+  cir.call @takes_two_long(%1) : (!rec_TwoLong) -> ()
+  cir.return
+}
+
+// CHECK:      cir.func{{.*}} @caller_two_long()
+// CHECK:        %[[VAL:.*]] = cir.load %{{.*}} : !cir.ptr<!rec_TwoLong>, !rec_TwoLong
+// Coerce TwoLong → coerced struct then extract fields.
+// CHECK:        %[[CV:.*]] = cir.load %{{.*}} : !cir.ptr<{{.*}}>, {{.*}}
+// CHECK:        %[[F0:.*]] = cir.extract_member %[[CV]][0] : {{.*}} -> !s64i
+// CHECK:        %[[F1:.*]] = cir.extract_member %[[CV]][1] : {{.*}} -> !s64i
+// CHECK:        cir.call @takes_two_long(%[[F0]], %[[F1]]) : (!s64i, !s64i) -> ()
+
+// Three-field flatten (CUDA dim3 with 3 equal-width fields → 3 scalar args).
+// Covers the numFields == 3 path in insertArgCoercion.
+
+cir.func @takes_three_int(%arg0: !rec_ThreeInt)
+    attributes { test_classify = #flatten_three } {
+  cir.return
+}
+
+// CHECK:      cir.func{{.*}} @takes_three_int(%[[A:.*]]: !s32i, %[[B:.*]]: !s32i, %[[C:.*]]: !s32i)
+// Two ["coerce"] allocas appear; the fields are stored via the second one.
+// CHECK:        cir.alloca {{.*}} ["coerce"]
+// CHECK:        cir.alloca {{.*}} ["coerce"]
+// CHECK:        %[[PA:.*]] = cir.get_member %{{.*}}[0]
+// CHECK:        cir.store %[[A]], %[[PA]] : !s32i, !cir.ptr<!s32i>
+// CHECK:        %[[PB:.*]] = cir.get_member %{{.*}}[1]
+// CHECK:        cir.store %[[B]], %[[PB]] : !s32i, !cir.ptr<!s32i>
+// CHECK:        %[[PC:.*]] = cir.get_member %{{.*}}[2]
+// CHECK:        cir.store %[[C]], %[[PC]] : !s32i, !cir.ptr<!s32i>
+
+cir.func @caller_three_int() attributes { test_classify = #passthrough } {
+  %0 = cir.alloca !rec_ThreeInt, !cir.ptr<!rec_ThreeInt>, ["s"] {alignment = 4 : i64}
+  %1 = cir.load %0 : !cir.ptr<!rec_ThreeInt>, !rec_ThreeInt
+  cir.call @takes_three_int(%1) : (!rec_ThreeInt) -> ()
+  cir.return
+}
+
+// CHECK:      cir.func{{.*}} @caller_three_int()
+// CHECK:        %[[F0:.*]] = cir.extract_member %{{.*}}[0] : {{.*}} -> !s32i
+// CHECK:        %[[F1:.*]] = cir.extract_member %{{.*}}[1] : {{.*}} -> !s32i
+// CHECK:        %[[F2:.*]] = cir.extract_member %{{.*}}[2] : {{.*}} -> !s32i
+// CHECK:        cir.call @takes_three_int(%[[F0]], %[[F1]], %[[F2]]) : (!s32i, !s32i, !s32i) -> ()
+
+// Mixed-packing case (x86-64 SysV dim3 pattern): struct { uint x, y, z; }
+// is coerced to {i64, i32} — the first two i32s are packed into an i64,
+// producing 2 wire arguments from a 3-field source struct.
+
+cir.func @takes_dim3(%arg0: !rec_Dim3)
+    attributes { test_classify = #flatten_dim3 } {
+  cir.return
+}
+
+// CHECK:      cir.func{{.*}} @takes_dim3(%[[D0:.*]]: !s64i, %[[D1:.*]]: !s32i)
+// Two ["coerce"] allocas appear; the {i64, i32} fields are stored into the second.
+// CHECK:        cir.alloca {{.*}} ["coerce"]
+// CHECK:        cir.alloca {{.*}} ["coerce"]
+// CHECK:        %{{.*}} = cir.get_member %{{.*}}[0]
+// CHECK:        cir.store %[[D0]], %{{.*}} : !s64i
+// CHECK:        %{{.*}} = cir.get_member %{{.*}}[1]
+// CHECK:        cir.store %[[D1]], %{{.*}} : !s32i
+
+cir.func @caller_dim3() attributes { test_classify = #passthrough } {
+  %0 = cir.alloca !rec_Dim3, !cir.ptr<!rec_Dim3>, ["d"] {alignment = 4 : i64}
+  %1 = cir.load %0 : !cir.ptr<!rec_Dim3>, !rec_Dim3
+  cir.call @takes_dim3(%1) : (!rec_Dim3) -> ()
+  cir.return
+}
+
+// CHECK:      cir.func{{.*}} @caller_dim3()
+// Coerce Dim3 → {i64, i32} then extract two fields.
+// CHECK:        %[[E0:.*]] = cir.extract_member %{{.*}}[0] : {{.*}} -> !s64i
+// CHECK:        %[[E1:.*]] = cir.extract_member %{{.*}}[1] : {{.*}} -> !s32i
+// CHECK:        cir.call @takes_dim3(%[[E0]], %[[E1]]) : (!s64i, !s32i) -> ()
+
+// Two flatten args with an Ignore in between: exercises the Ignore-drop loop's
+// classToBlockArg mapping when canFlatten args occupy multiple block arg slots.
+
+cir.func @flatten_ignore_flatten_callee(
+    %a: !rec_TwoLong, %b: !rec_TwoLong, %c: !rec_TwoLong)
+    attributes { test_classify = #flatten_ignore_flatten } {
+  cir.return
+}
+
+// CHECK:      cir.func{{.*}} @flatten_ignore_flatten_callee(
+// CHECK-SAME:     %{{.*}}: !s64i, %{{.*}}: !s64i, %{{.*}}: !s64i, %{{.*}}: !s64i)
+// The middle ignored arg is dropped; two flatten slots produce four block args.
+// CHECK-DAG:    cir.alloca {{.*}} ["coerce"]
+// CHECK-DAG:    cir.alloca {{.*}} ["coerce"]
+
+}

>From f92034faad39a8d530970ae8f4aa08ae0aa200b0 Mon Sep 17 00:00:00 2001
From: Adam Smith <adams at nvidia.com>
Date: Fri, 12 Jun 2026 08:18:46 -0700
Subject: [PATCH 2/2] [CIR] Load flattened struct args from coerce slot

Previously, at the call site, a struct argument that flattens into scalar
wire arguments was coerced to the ABI struct as a whole value and then
decomposed with cir.extract_member.  Now, when the coercion goes through
memory, each field is read from the coerce slot with cir.get_member +
cir.load instead, so the lowering takes pointers to the members it wants
rather than loading the entire structure and extracting from the value.
The shared memory half of the coercion is factored into
emitCoercionToMemory, which returns the destination-typed pointer to the
coerce slot; emitCoercion now builds on it and loads the whole value, so
its existing callers are unchanged.  The no-coercion call site (the operand
already has the coerced type) keeps cir.extract_member because that value
has no backing slot to take member pointers from.

The remaining changes are mechanical: llvm::append_range and
SmallVector::append for the per-field loops, spelling out cir::RecordType
instead of auto at the getFlattenedCoercedType call sites, an enumerate
loop over the coerced members, and renaming the builder parameter from
rewriter to builder in insertArgCoercion and the emitCoercion overloads.

The caller check groups in direct-flatten.cir are updated to the
get_member + load shape, including the packed dim3 case, which reads both
fields from the coerce slot.
---
 .../TargetLowering/CIRABIRewriteContext.cpp   | 168 ++++++++++--------
 .../abi-lowering/direct-flatten.cir           |  45 +++--
 2 files changed, 130 insertions(+), 83 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRABIRewriteContext.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRABIRewriteContext.cpp
index 113d68ef00e3e..ccf80b93ab955 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRABIRewriteContext.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRABIRewriteContext.cpp
@@ -92,9 +92,8 @@ LogicalResult buildNewArgTypes(ArrayRef<Type> oldArgTypes,
       // per field of the coerced struct rather than the struct itself.
       // Single-field coerced structs fall through to the non-flatten path —
       // the struct is already scalar-sized and flattening adds no value.
-      if (auto flatTy = getFlattenedCoercedType(ac)) {
-        for (Type memberTy : flatTy.getMembers())
-          newArgTypes.push_back(memberTy);
+      if (cir::RecordType flatTy = getFlattenedCoercedType(ac)) {
+        llvm::append_range(newArgTypes, flatTy.getMembers());
       } else {
         // Direct with a coerced type: the wire signature uses the coerced
         // type; the body still expects origTy and insertArgCoercion recovers
@@ -197,10 +196,9 @@ ArrayAttr updateArgAttrs(MLIRContext *ctx, ArrayRef<Type> origArgTypes,
     DictionaryAttr existing = DictionaryAttr::get(ctx);
     if (existingArgAttrs && oldIdx < existingArgAttrs.size())
       existing = cast<DictionaryAttr>(existingArgAttrs[oldIdx]);
-    if (auto flatTy = getFlattenedCoercedType(ac)) {
+    if (cir::RecordType flatTy = getFlattenedCoercedType(ac)) {
       // Direct + canFlatten: one empty dict per flattened field.
-      for (unsigned i = 0; i < flatTy.getNumElements(); ++i)
-        newArgAttrs.push_back(DictionaryAttr::get(ctx));
+      newArgAttrs.append(flatTy.getNumElements(), DictionaryAttr::get(ctx));
     } else if (ac.kind == ArgKind::Expand) {
       // Pure Expand: one empty dict per struct field.
       auto recTy = cast<cir::RecordType>(origArgTypes[oldIdx]);
@@ -273,18 +271,22 @@ ArrayAttr updateResAttrs(MLIRContext *ctx, ArrayAttr existingResAttrs,
   return ArrayAttr::get(ctx, {DictionaryAttr::get(ctx, attrs)});
 }
 
-/// Coerce \p src to type \p dstTy at the current builder insertion point by
-/// going through memory: allocate a slot, store the source, then load the
-/// destination type back out.  Lowers uniformly for scalar, vector, and
-/// record types.
+/// Coerce \p src into a temporary memory slot typed for \p dstTy at the
+/// current builder insertion point, and return the destination-typed pointer
+/// to that slot without loading the value back out.  This is the shared
+/// memory half of emitCoercion: callers that want the whole coerced value use
+/// emitCoercion (below); callers that want to read individual members of a
+/// coerced struct (the call-site struct flattening) take the returned pointer
+/// and emit their own cir.get_member + cir.load per field.  Lowers uniformly
+/// for scalar, vector, and record types.
 ///
-/// The slot is sized to the larger of the two types so that neither the
-/// store nor the load ever runs past it: the coerced ABI type can be larger
-/// than the original (e.g. a 12-byte aggregate returned as `{i64, i64}`), so
-/// loading the destination out of a source-sized slot would over-read.
+/// The slot is sized to the larger of the two types so that neither the store
+/// nor a later load ever runs past it: the coerced ABI type can be larger
+/// than the original (e.g. a 12-byte aggregate passed as `{i64, i64}`), so
+/// accessing the destination through a source-sized slot would over-read.
 /// Alignment is max(srcAlign, dstAlign) to satisfy both accesses.  The slot
-/// is accessed through a source-typed view for the store and a
-/// destination-typed view for the load.
+/// is written through a source-typed view and returned as a destination-typed
+/// view.
 ///
 /// The temporary alloca is placed at the start of the enclosing function's
 /// entry block so that it composes correctly with the HoistAllocas pass
@@ -293,9 +295,10 @@ ArrayAttr updateResAttrs(MLIRContext *ctx, ArrayAttr existingResAttrs,
 /// Any operations the helper creates are appended to \p createdOps so the
 /// caller can pass them to replaceAllUsesExcept and avoid clobbering the
 /// store's value operand when later rewiring the source value.
-Value emitCoercion(OpBuilder &rewriter, Location loc, Type dstTy, Value src,
-                   FunctionOpInterface funcOp, const DataLayout &dl,
-                   SmallPtrSetImpl<Operation *> &createdOps) {
+Value emitCoercionToMemory(OpBuilder &builder, Location loc, Type dstTy,
+                           Value src, FunctionOpInterface funcOp,
+                           const DataLayout &dl,
+                           SmallPtrSetImpl<Operation *> &createdOps) {
   Type srcTy = src.getType();
   assert(srcTy != dstTy &&
          "emitCoercion callers must pre-check that the types differ");
@@ -311,45 +314,55 @@ Value emitCoercion(OpBuilder &rewriter, Location loc, Type dstTy, Value src,
 
   cir::AllocaOp alloca;
   {
-    OpBuilder::InsertionGuard guard(rewriter);
+    OpBuilder::InsertionGuard guard(builder);
     Block &entry = funcOp->getRegion(0).front();
-    rewriter.setInsertionPointToStart(&entry);
-    alloca = cir::AllocaOp::create(rewriter, loc, slotPtrTy, slotTy,
-                                   rewriter.getStringAttr("coerce"),
-                                   rewriter.getI64IntegerAttr(allocaAlign));
+    builder.setInsertionPointToStart(&entry);
+    alloca = cir::AllocaOp::create(builder, loc, slotPtrTy, slotTy,
+                                   builder.getStringAttr("coerce"),
+                                   builder.getI64IntegerAttr(allocaAlign));
   }
   createdOps.insert(alloca);
 
   // Store through a source-typed view of the slot.
   Value srcSlot = alloca;
   if (slotTy != srcTy) {
-    auto srcCast = cir::CastOp::create(rewriter, loc, srcPtrTy,
+    auto srcCast = cir::CastOp::create(builder, loc, srcPtrTy,
                                        cir::CastKind::bitcast, alloca);
     createdOps.insert(srcCast);
     srcSlot = srcCast;
   }
-  auto store = cir::StoreOp::create(rewriter, loc, src, srcSlot);
+  auto store = cir::StoreOp::create(builder, loc, src, srcSlot);
   createdOps.insert(store);
 
-  // Load through a destination-typed view of the slot.
-  Value dstSlot = alloca;
+  // Return a destination-typed view of the slot.
   if (slotTy != dstTy) {
-    auto dstCast = cir::CastOp::create(rewriter, loc, dstPtrTy,
+    auto dstCast = cir::CastOp::create(builder, loc, dstPtrTy,
                                        cir::CastKind::bitcast, alloca);
     createdOps.insert(dstCast);
-    dstSlot = dstCast;
+    return dstCast;
   }
-  auto load = cir::LoadOp::create(rewriter, loc, dstSlot);
+  return alloca;
+}
+
+/// Coerce \p src to type \p dstTy by going through memory and load the whole
+/// coerced value back out.  Builds on emitCoercionToMemory, adding the final
+/// load of the destination-typed view.
+Value emitCoercion(OpBuilder &builder, Location loc, Type dstTy, Value src,
+                   FunctionOpInterface funcOp, const DataLayout &dl,
+                   SmallPtrSetImpl<Operation *> &createdOps) {
+  Value dstSlot =
+      emitCoercionToMemory(builder, loc, dstTy, src, funcOp, dl, createdOps);
+  auto load = cir::LoadOp::create(builder, loc, dstSlot);
   createdOps.insert(load);
   return load;
 }
 
 /// Convenience overload for callers that don't need the createdOps set
 /// (e.g. call-site coercion where we don't replaceAllUsesExcept).
-Value emitCoercion(OpBuilder &rewriter, Location loc, Type dstTy, Value src,
+Value emitCoercion(OpBuilder &builder, Location loc, Type dstTy, Value src,
                    FunctionOpInterface funcOp, const DataLayout &dl) {
   SmallPtrSet<Operation *, 4> ignored;
-  return emitCoercion(rewriter, loc, dstTy, src, funcOp, dl, ignored);
+  return emitCoercion(builder, loc, dstTy, src, funcOp, dl, ignored);
 }
 
 /// Insert coercion before each cir.return so the returned value matches the
@@ -386,7 +399,7 @@ void insertReturnCoercion(FunctionOpInterface funcOp, Type origRetTy,
 /// block argument count, so a running index tracks the current block argument
 /// position rather than computing \p classIdx + \p sretOffset directly.
 void insertArgCoercion(FunctionOpInterface funcOp,
-                       const FunctionClassification &fc, OpBuilder &rewriter,
+                       const FunctionClassification &fc, OpBuilder &builder,
                        const DataLayout &dl, unsigned sretOffset) {
   Region &body = funcOp->getRegion(0);
   if (body.empty())
@@ -427,24 +440,24 @@ void insertArgCoercion(FunctionOpInterface funcOp,
       // the emission order relative to the classification order.  The SSA
       // subgraphs are fully independent — each alloca is written through
       // its own field block args — so the inverted ordering is safe.
-      rewriter.setInsertionPointToStart(&entry);
+      builder.setInsertionPointToStart(&entry);
       auto ptrTy = cir::PointerType::get(recTy);
       uint64_t align = dl.getTypeABIAlignment(recTy);
-      auto slot = cir::AllocaOp::create(rewriter, loc, ptrTy, recTy,
-                                        rewriter.getStringAttr("expand"),
-                                        rewriter.getI64IntegerAttr(align));
+      auto slot = cir::AllocaOp::create(builder, loc, ptrTy, recTy,
+                                        builder.getStringAttr("expand"),
+                                        builder.getI64IntegerAttr(align));
       SmallPtrSet<Operation *, 8> expandOps = {slot};
       for (unsigned f = 0; f < numFields; ++f) {
         Type fieldPtrTy = cir::PointerType::get(recTy.getElementType(f));
-        auto fieldPtr = cir::GetMemberOp::create(rewriter, loc, fieldPtrTy,
-                                                 slot, /*name=*/"",
+        auto fieldPtr = cir::GetMemberOp::create(builder, loc, fieldPtrTy, slot,
+                                                 /*name=*/"",
                                                  /*index=*/f);
         expandOps.insert(fieldPtr);
         auto storeOp = cir::StoreOp::create(
-            rewriter, loc, entry.getArgument(blockArgIdx + f), fieldPtr);
+            builder, loc, entry.getArgument(blockArgIdx + f), fieldPtr);
         expandOps.insert(storeOp);
       }
-      auto loaded = cir::LoadOp::create(rewriter, loc, recTy, slot.getResult());
+      auto loaded = cir::LoadOp::create(builder, loc, recTy, slot.getResult());
       expandOps.insert(loaded);
 
       // Replace all original body uses of the struct block arg with the
@@ -458,7 +471,7 @@ void insertArgCoercion(FunctionOpInterface funcOp,
 
     BlockArgument blockArg = entry.getArgument(blockArgIdx);
 
-    if (auto flatTy = getFlattenedCoercedType(ac)) {
+    if (cir::RecordType flatTy = getFlattenedCoercedType(ac)) {
       // Direct + canFlatten: the coerced type is a struct whose fields become
       // individual wire arguments.  The reconstruction mirrors the Expand path
       // — replace the single block arg with N scalar block args, store them
@@ -476,25 +489,25 @@ void insertArgCoercion(FunctionOpInterface funcOp,
         entry.insertArgument(blockArgIdx + f, flatTy.getElementType(f), loc);
 
       // setInsertionPointToStart: see comment in the Expand arm above.
-      rewriter.setInsertionPointToStart(&entry);
+      builder.setInsertionPointToStart(&entry);
       auto flatPtrTy = cir::PointerType::get(flatTy);
       uint64_t flatAlign = dl.getTypeABIAlignment(flatTy);
       auto flatSlot = cir::AllocaOp::create(
-          rewriter, loc, flatPtrTy, flatTy, rewriter.getStringAttr("coerce"),
-          rewriter.getI64IntegerAttr(flatAlign));
+          builder, loc, flatPtrTy, flatTy, builder.getStringAttr("coerce"),
+          builder.getI64IntegerAttr(flatAlign));
       SmallPtrSet<Operation *, 8> flattenOps = {flatSlot};
-      for (unsigned f = 0; f < numFields; ++f) {
-        Type fieldPtrTy = cir::PointerType::get(flatTy.getElementType(f));
-        auto fieldPtr = cir::GetMemberOp::create(rewriter, loc, fieldPtrTy,
+      for (auto [f, fieldTy] : llvm::enumerate(flatTy.getMembers())) {
+        Type fieldPtrTy = cir::PointerType::get(fieldTy);
+        auto fieldPtr = cir::GetMemberOp::create(builder, loc, fieldPtrTy,
                                                  flatSlot, /*name=*/"",
                                                  /*index=*/f);
         flattenOps.insert(fieldPtr);
         auto storeOp = cir::StoreOp::create(
-            rewriter, loc, entry.getArgument(blockArgIdx + f), fieldPtr);
+            builder, loc, entry.getArgument(blockArgIdx + f), fieldPtr);
         flattenOps.insert(storeOp);
       }
       auto flatLoaded =
-          cir::LoadOp::create(rewriter, loc, flatTy, flatSlot.getResult());
+          cir::LoadOp::create(builder, loc, flatTy, flatSlot.getResult());
       flattenOps.insert(flatLoaded);
 
       // If the coerced struct type differs from the original argument type,
@@ -502,7 +515,7 @@ void insertArgCoercion(FunctionOpInterface funcOp,
       Value finalVal = flatLoaded;
       if (origTy != flatTy) {
         SmallPtrSet<Operation *, 4> coercionOps;
-        finalVal = emitCoercion(rewriter, loc, origTy, flatLoaded, funcOp, dl,
+        finalVal = emitCoercion(builder, loc, origTy, flatLoaded, funcOp, dl,
                                 coercionOps);
         flattenOps.insert(coercionOps.begin(), coercionOps.end());
       }
@@ -524,10 +537,10 @@ void insertArgCoercion(FunctionOpInterface funcOp,
       }
       blockArg.setType(newArgTy);
 
-      rewriter.setInsertionPointToStart(&entry);
+      builder.setInsertionPointToStart(&entry);
       SmallPtrSet<Operation *, 4> coercionOps;
-      Value adapted = emitCoercion(rewriter, funcOp.getLoc(), oldArgTy,
-                                   blockArg, funcOp, dl, coercionOps);
+      Value adapted = emitCoercion(builder, funcOp.getLoc(), oldArgTy, blockArg,
+                                   funcOp, dl, coercionOps);
 
       // Replace blockArg uses with the adapted value, except inside the
       // helper ops we just created.  This is critical: the StoreOp's
@@ -545,9 +558,9 @@ void insertArgCoercion(FunctionOpInterface funcOp,
       auto ptrTy = cir::PointerType::get(origTy);
       blockArg.setType(ptrTy);
 
-      rewriter.setInsertionPointToStart(&entry);
+      builder.setInsertionPointToStart(&entry);
       auto loadOp =
-          cir::LoadOp::create(rewriter, funcOp.getLoc(), origTy, blockArg);
+          cir::LoadOp::create(builder, funcOp.getLoc(), origTy, blockArg);
       SmallPtrSet<Operation *, 1> loadOps = {loadOp};
       blockArg.replaceAllUsesExcept(loadOp.getResult(), loadOps);
     }
@@ -752,7 +765,8 @@ LogicalResult CIRABIRewriteContext::rewriteFunctionDefinition(
         unsigned runningIdx = sretOffset;
         for (unsigned i = 0; i < fc.argInfos.size(); ++i) {
           classToBlockArg[i] = runningIdx;
-          if (auto flatTy = getFlattenedCoercedType(fc.argInfos[i])) {
+          if (cir::RecordType flatTy =
+                  getFlattenedCoercedType(fc.argInfos[i])) {
             // Direct + canFlatten: N slots, one per coerced struct field.
             runningIdx += flatTy.getNumElements();
           } else if (fc.argInfos[i].kind == ArgKind::Expand) {
@@ -893,19 +907,31 @@ LogicalResult CIRABIRewriteContext::rewriteCallSite(
     if (ac.kind == ArgKind::Ignore)
       continue;
     Value arg = argOperands[idx];
-    if (auto flatTy = getFlattenedCoercedType(ac)) {
-      // Direct + canFlatten: coerce the struct to the ABI-coerced struct type
-      // and then extract each field as a separate call argument.  The coercion
-      // is a memory round-trip when the original and coerced types differ in
-      // layout; when they are the same CIR type the coercion is skipped.
-      Value coerced = arg;
-      if (arg.getType() != flatTy)
-        coerced = emitCoercion(builder, call.getLoc(), flatTy, arg,
-                               enclosingFunc, dl);
-      for (unsigned f = 0; f < flatTy.getNumElements(); ++f) {
-        Value field =
-            cir::ExtractMemberOp::create(builder, call.getLoc(), coerced, f);
-        newArgs.push_back(field);
+    if (cir::RecordType flatTy = getFlattenedCoercedType(ac)) {
+      // Direct + canFlatten: pass one scalar call argument per field of the
+      // ABI-coerced struct.  When the original and coerced CIR types are not
+      // identical, coerce through a memory slot and read each field with
+      // cir.get_member + cir.load from that slot, rather than loading the
+      // whole coerced struct and extracting members from the value.  When the
+      // types are already identical there is no backing slot (arg is a plain
+      // struct value), so extract each field directly from the value.
+      if (arg.getType() != flatTy) {
+        SmallPtrSet<Operation *, 4> coercionOps;
+        Value coercedPtr =
+            emitCoercionToMemory(builder, call.getLoc(), flatTy, arg,
+                                 enclosingFunc, dl, coercionOps);
+        for (auto [f, fieldTy] : llvm::enumerate(flatTy.getMembers())) {
+          Type fieldPtrTy = cir::PointerType::get(fieldTy);
+          auto fieldPtr =
+              cir::GetMemberOp::create(builder, call.getLoc(), fieldPtrTy,
+                                       coercedPtr, /*name=*/"", /*index=*/f);
+          newArgs.push_back(cir::LoadOp::create(builder, call.getLoc(), fieldTy,
+                                                fieldPtr.getResult()));
+        }
+      } else {
+        for (unsigned f = 0; f < flatTy.getNumElements(); ++f)
+          newArgs.push_back(
+              cir::ExtractMemberOp::create(builder, call.getLoc(), arg, f));
       }
     } else if (ac.kind == ArgKind::Expand) {
       // Decompose the struct value into its constituent scalar fields and
diff --git a/clang/test/CIR/Transforms/abi-lowering/direct-flatten.cir b/clang/test/CIR/Transforms/abi-lowering/direct-flatten.cir
index 97559715e9173..30a4385373c9a 100644
--- a/clang/test/CIR/Transforms/abi-lowering/direct-flatten.cir
+++ b/clang/test/CIR/Transforms/abi-lowering/direct-flatten.cir
@@ -91,8 +91,9 @@ cir.func private @takes_two_long_decl(!rec_TwoLong)
 // CHECK:      cir.func{{.*}} @takes_two_long_decl(!s64i, !s64i)
 // CHECK-NOT:  !rec_TwoLong
 
-// Caller: the TwoLong operand is coerced to the anonymous struct type via a
-// memory round-trip, then each field is extracted as a separate call argument.
+// Caller: the TwoLong operand is stored to a temporary "coerce" slot, the slot
+// is viewed as the coerced struct type through a pointer bitcast, and each
+// field is read with get_member + load and passed as a separate call argument.
 
 cir.func @caller_two_long() attributes { test_classify = #passthrough } {
   %0 = cir.alloca !rec_TwoLong, !cir.ptr<!rec_TwoLong>, ["s"] {alignment = 8 : i64}
@@ -102,11 +103,16 @@ cir.func @caller_two_long() attributes { test_classify = #passthrough } {
 }
 
 // CHECK:      cir.func{{.*}} @caller_two_long()
+// CHECK:        %[[COERCE:.*]] = cir.alloca {{.*}} ["coerce"]
 // CHECK:        %[[VAL:.*]] = cir.load %{{.*}} : !cir.ptr<!rec_TwoLong>, !rec_TwoLong
-// Coerce TwoLong → coerced struct then extract fields.
-// CHECK:        %[[CV:.*]] = cir.load %{{.*}} : !cir.ptr<{{.*}}>, {{.*}}
-// CHECK:        %[[F0:.*]] = cir.extract_member %[[CV]][0] : {{.*}} -> !s64i
-// CHECK:        %[[F1:.*]] = cir.extract_member %[[CV]][1] : {{.*}} -> !s64i
+// Store the source value into the coerce slot, then read each field through a
+// coerced-struct view with get_member + load.
+// CHECK:        cir.store %[[VAL]], %[[COERCE]] : !rec_TwoLong, !cir.ptr<!rec_TwoLong>
+// CHECK:        %[[CAST:.*]] = cir.cast bitcast %[[COERCE]] : !cir.ptr<!rec_TwoLong> -> !cir.ptr<{{.*}}>
+// CHECK:        %[[P0:.*]] = cir.get_member %[[CAST]][0] {{.*}} -> !cir.ptr<!s64i>
+// CHECK:        %[[F0:.*]] = cir.load %[[P0]] : !cir.ptr<!s64i>, !s64i
+// CHECK:        %[[P1:.*]] = cir.get_member %[[CAST]][1] {{.*}} -> !cir.ptr<!s64i>
+// CHECK:        %[[F1:.*]] = cir.load %[[P1]] : !cir.ptr<!s64i>, !s64i
 // CHECK:        cir.call @takes_two_long(%[[F0]], %[[F1]]) : (!s64i, !s64i) -> ()
 
 // Three-field flatten (CUDA dim3 with 3 equal-width fields → 3 scalar args).
@@ -136,9 +142,16 @@ cir.func @caller_three_int() attributes { test_classify = #passthrough } {
 }
 
 // CHECK:      cir.func{{.*}} @caller_three_int()
-// CHECK:        %[[F0:.*]] = cir.extract_member %{{.*}}[0] : {{.*}} -> !s32i
-// CHECK:        %[[F1:.*]] = cir.extract_member %{{.*}}[1] : {{.*}} -> !s32i
-// CHECK:        %[[F2:.*]] = cir.extract_member %{{.*}}[2] : {{.*}} -> !s32i
+// CHECK:        %[[COERCE:.*]] = cir.alloca {{.*}} ["coerce"]
+// CHECK:        %[[VAL:.*]] = cir.load %{{.*}} : !cir.ptr<!rec_ThreeInt>, !rec_ThreeInt
+// CHECK:        cir.store %[[VAL]], %[[COERCE]] : !rec_ThreeInt, !cir.ptr<!rec_ThreeInt>
+// CHECK:        %[[CAST:.*]] = cir.cast bitcast %[[COERCE]] : !cir.ptr<!rec_ThreeInt> -> !cir.ptr<{{.*}}>
+// CHECK:        %[[P0:.*]] = cir.get_member %[[CAST]][0] {{.*}} -> !cir.ptr<!s32i>
+// CHECK:        %[[F0:.*]] = cir.load %[[P0]] : !cir.ptr<!s32i>, !s32i
+// CHECK:        %[[P1:.*]] = cir.get_member %[[CAST]][1] {{.*}} -> !cir.ptr<!s32i>
+// CHECK:        %[[F1:.*]] = cir.load %[[P1]] : !cir.ptr<!s32i>, !s32i
+// CHECK:        %[[P2:.*]] = cir.get_member %[[CAST]][2] {{.*}} -> !cir.ptr<!s32i>
+// CHECK:        %[[F2:.*]] = cir.load %[[P2]] : !cir.ptr<!s32i>, !s32i
 // CHECK:        cir.call @takes_three_int(%[[F0]], %[[F1]], %[[F2]]) : (!s32i, !s32i, !s32i) -> ()
 
 // Mixed-packing case (x86-64 SysV dim3 pattern): struct { uint x, y, z; }
@@ -167,9 +180,17 @@ cir.func @caller_dim3() attributes { test_classify = #passthrough } {
 }
 
 // CHECK:      cir.func{{.*}} @caller_dim3()
-// Coerce Dim3 → {i64, i32} then extract two fields.
-// CHECK:        %[[E0:.*]] = cir.extract_member %{{.*}}[0] : {{.*}} -> !s64i
-// CHECK:        %[[E1:.*]] = cir.extract_member %{{.*}}[1] : {{.*}} -> !s32i
+// Coerce Dim3 into the {i64, i32} slot, then read the two fields from the
+// slot with get_member + load.  The coerced struct is larger than Dim3, so
+// the slot is the coerced type and the bitcast is on the store (source) side.
+// CHECK:        %[[COERCE:.*]] = cir.alloca {{.*}} ["coerce"]
+// CHECK:        %[[VAL:.*]] = cir.load %{{.*}} : !cir.ptr<!rec_Dim3>, !rec_Dim3
+// CHECK:        %[[CAST:.*]] = cir.cast bitcast %[[COERCE]] : {{.*}} -> !cir.ptr<!rec_Dim3>
+// CHECK:        cir.store %[[VAL]], %[[CAST]] : !rec_Dim3, !cir.ptr<!rec_Dim3>
+// CHECK:        %[[P0:.*]] = cir.get_member %[[COERCE]][0] {{.*}} -> !cir.ptr<!s64i>
+// CHECK:        %[[E0:.*]] = cir.load %[[P0]] : !cir.ptr<!s64i>, !s64i
+// CHECK:        %[[P1:.*]] = cir.get_member %[[COERCE]][1] {{.*}} -> !cir.ptr<!s32i>
+// CHECK:        %[[E1:.*]] = cir.load %[[P1]] : !cir.ptr<!s32i>, !s32i
 // CHECK:        cir.call @takes_dim3(%[[E0]], %[[E1]]) : (!s64i, !s32i) -> ()
 
 // Two flatten args with an Ignore in between: exercises the Ignore-drop loop's