[flang-commits] [flang] [flang][fir] always use memcpy for fir.box (PR #113949)

Mon Oct 28 12:18:54 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Asher Mancinelli (ashermancinelli)

<details>
<summary>Changes</summary>

@jeanPerier explained the importance of converting box loads and stores into `memcpy`s instead of aggregate loads and stores, and I'll do my best to explain it here.

* [(godbolt link) Example comparing opt transformations on memcpys vs aggregate load/stores](https://godbolt.org/z/be7xM83cG)
  * LLVM can reason about memcpys more effectively than about aggregate loads/stores.
  * This came up when others were discussing array descriptors for assumed-rank arrays passed to `bind(c)` subroutines, with the implication that the array descriptors are known to have lower bounds of 1 and that they are not pointer/allocatable types.
* [(godbolt link) Clang also uses memcpys, so we should probably follow them, assuming the clang developers are generating what they know Opt will handle more effectively.](https://godbolt.org/z/YT4x7387W)
* This currently may not help much without the `nocapture` attribute being propagated to function calls, but [it looks like someone may do this soon (discourse link)](https://discourse.llvm.org/t/applying-the-nocapture-attribute-to-reference-passed-arguments-in-fortran-subroutines/81401/23) or I can do this in a follow-up patch.

Note on test `flang/test/Fir/embox-char.fir`: it looks like the original test was auto-generated. I wasn't too sure which parts were especially important to test, so I regenerated the test. If we want the updated version to look more like the old version, I'll make those changes.

---

Patch is 57.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113949.diff


11 Files Affected:

- (modified) flang/lib/Optimizer/CodeGen/CodeGen.cpp (+20-39) 
- (modified) flang/test/Fir/box.fir (+13-6) 
- (modified) flang/test/Fir/convert-to-llvm-openmp-and-fir.fir (+2-2) 
- (modified) flang/test/Fir/convert-to-llvm.fir (+15-13) 
- (modified) flang/test/Fir/embox-char.fir (+121-118) 
- (modified) flang/test/Fir/polymorphic.fir (+4-8) 
- (modified) flang/test/Fir/tbaa.fir (+2-2) 
- (modified) flang/test/Integration/OpenMP/private-global.f90 (+3-2) 
- (modified) flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 (+1-2) 
- (modified) flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 (+1-1) 
- (modified) flang/test/Lower/allocatable-polymorphic.f90 (+6-12) 


``````````diff

diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index e6eeb0d5db4a84..4c8c56e0f21cef 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2949,9 +2949,10 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
   llvm::LogicalResult
   matchAndRewrite(fir::LoadOp load, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
+
     mlir::Type llvmLoadTy = convertObjectType(load.getType());
     if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(load.getType())) {
-      // fir.box is a special case because it is considered as an ssa values in
+      // fir.box is a special case because it is considered an ssa value in
       // fir, but it is lowered as a pointer to a descriptor. So
       // fir.ref<fir.box> and fir.box end up being the same llvm types and
       // loading a fir.ref<fir.box> is implemented as taking a snapshot of the
@@ -2960,30 +2961,17 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
       mlir::Location loc = load.getLoc();
       auto newBoxStorage =
           genAllocaAndAddrCastWithType(loc, llvmLoadTy, defaultAlign, rewriter);
-      // TODO: always generate llvm.memcpy, LLVM is better at optimizing it than
-      // aggregate loads + stores.
-      if (boxTy.isAssumedRank()) {
-
-        TypePair boxTypePair{boxTy, llvmLoadTy};
-        mlir::Value boxSize =
-            computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter);
-        auto memcpy = rewriter.create<mlir::LLVM::MemcpyOp>(
-            loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false);
-        if (std::optional<mlir::ArrayAttr> optionalTag = load.getTbaa())
-          memcpy.setTBAATags(*optionalTag);
-        else
-          attachTBAATag(memcpy, boxTy, boxTy, nullptr);
-      } else {
-        auto boxValue = rewriter.create<mlir::LLVM::LoadOp>(loc, llvmLoadTy,
-                                                            inputBoxStorage);
-        if (std::optional<mlir::ArrayAttr> optionalTag = load.getTbaa())
-          boxValue.setTBAATags(*optionalTag);
-        else
-          attachTBAATag(boxValue, boxTy, boxTy, nullptr);
-        auto storeOp =
-            rewriter.create<mlir::LLVM::StoreOp>(loc, boxValue, newBoxStorage);
-        attachTBAATag(storeOp, boxTy, boxTy, nullptr);
-      }
+
+      TypePair boxTypePair{boxTy, llvmLoadTy};
+      mlir::Value boxSize =
+          computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter);
+      auto memcpy = rewriter.create<mlir::LLVM::MemcpyOp>(
+          loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false);
+
+      if (std::optional<mlir::ArrayAttr> optionalTag = load.getTbaa())
+        memcpy.setTBAATags(*optionalTag);
+      else
+        attachTBAATag(memcpy, boxTy, boxTy, nullptr);
       rewriter.replaceOp(load, newBoxStorage);
     } else {
       auto loadOp = rewriter.create<mlir::LLVM::LoadOp>(
@@ -3227,20 +3215,13 @@ struct StoreOpConversion : public fir::FIROpConversion<fir::StoreOp> {
     mlir::LLVM::AliasAnalysisOpInterface newOp;
     if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(storeTy)) {
       mlir::Type llvmBoxTy = lowerTy().convertBoxTypeAsStruct(boxTy);
-      // fir.box value is actually in memory, load it first before storing it,
-      // or do a memcopy for assumed-rank descriptors.
-      if (boxTy.isAssumedRank()) {
-        TypePair boxTypePair{boxTy, llvmBoxTy};
-        mlir::Value boxSize =
-            computeBoxSize(loc, boxTypePair, llvmValue, rewriter);
-        newOp = rewriter.create<mlir::LLVM::MemcpyOp>(
-            loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false);
-      } else {
-        auto val =
-            rewriter.create<mlir::LLVM::LoadOp>(loc, llvmBoxTy, llvmValue);
-        attachTBAATag(val, boxTy, boxTy, nullptr);
-        newOp = rewriter.create<mlir::LLVM::StoreOp>(loc, val, llvmMemref);
-      }
+      // Always use memcpy because LLVM is not as effective at optimizing
+      // aggregate loads/stores as it is optimizing memcpy.
+      TypePair boxTypePair{boxTy, llvmBoxTy};
+      mlir::Value boxSize =
+          computeBoxSize(loc, boxTypePair, llvmValue, rewriter);
+      newOp = rewriter.create<mlir::LLVM::MemcpyOp>(
+          loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false);
     } else {
       newOp = rewriter.create<mlir::LLVM::StoreOp>(loc, llvmValue, llvmMemref);
     }
diff --git a/flang/test/Fir/box.fir b/flang/test/Fir/box.fir
index 81a4d8bc13bf01..fd9fa1f2b3aabd 100644
--- a/flang/test/Fir/box.fir
+++ b/flang/test/Fir/box.fir
@@ -56,12 +56,14 @@ func.func @fa(%a : !fir.ref<!fir.array<100xf32>>) {
 // CHECK-LABEL: define void @b1(
 // CHECK-SAME: ptr %[[res:.*]], ptr %[[arg0:.*]], i64 %[[arg1:.*]])
 func.func @b1(%arg0 : !fir.ref<!fir.char<1,?>>, %arg1 : index) -> !fir.box<!fir.char<1,?>> {
+  // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }
   // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]]
   // CHECK: insertvalue {{.*}} undef, i64 %[[size]], 1
   // CHECK: insertvalue {{.*}} i32 20240719, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
   %x = fir.embox %arg0 typeparams %arg1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
-  // CHECK: store {{.*}}, ptr %[[res]]
+  // CHECK: store {{.*}}, ptr %[[alloca]]
+  // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 24, i1 false)
   return %x : !fir.box<!fir.char<1,?>>
 }
 
@@ -71,11 +73,13 @@ func.func @b1(%arg0 : !fir.ref<!fir.char<1,?>>, %arg1 : index) -> !fir.box<!fir.
 // CHECK-SAME: ptr %[[arg0:.*]], i64 %[[arg1:.*]])
 func.func @b2(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,5>>>, %arg1 : index) -> !fir.box<!fir.array<?x!fir.char<1,5>>> {
   %1 = fir.shape %arg1 : (index) -> !fir.shape<1>
+  // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
   // CHECK: insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), i32 20240719, i8 1, i8 40, i8 0, i8 0, {{.*}} }, i64 %[[arg1]], 7, 0, 1
   // CHECK: insertvalue {{.*}} %{{.*}}, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), 7, 0, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
   %2 = fir.embox %arg0(%1) : (!fir.ref<!fir.array<?x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<?x!fir.char<1,5>>>
-  // CHECK: store {{.*}}, ptr %[[res]]
+  // CHECK: store {{.*}}, ptr %[[alloca]]
+  // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 48, i1 false)
   return %2 : !fir.box<!fir.array<?x!fir.char<1,5>>>
 }
 
@@ -84,6 +88,7 @@ func.func @b2(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,5>>>, %arg1 : index) ->
 // CHECK-SAME: ptr %[[res:.*]], ptr %[[arg0:.*]], i64 %[[arg1:.*]], i64 %[[arg2:.*]])
 func.func @b3(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,?>>>, %arg1 : index, %arg2 : index) -> !fir.box<!fir.array<?x!fir.char<1,?>>> {
   %1 = fir.shape %arg2 : (index) -> !fir.shape<1>
+  // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
   // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]]
   // CHECK: insertvalue {{.*}} i64 %[[size]], 1
   // CHECK: insertvalue {{.*}} i32 20240719, 2
@@ -91,7 +96,8 @@ func.func @b3(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,?>>>, %arg1 : index, %ar
   // CHECK: insertvalue {{.*}} i64 %[[size]], 7, 0, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
   %2 = fir.embox %arg0(%1) typeparams %arg1 : (!fir.ref<!fir.array<?x!fir.char<1,?>>>, !fir.shape<1>, index) -> !fir.box<!fir.array<?x!fir.char<1,?>>>
-  // CHECK: store {{.*}}, ptr %[[res]]
+  // CHECK: store {{.*}}, ptr %[[alloca]]
+  // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 48, i1 false)
   return %2 : !fir.box<!fir.array<?x!fir.char<1,?>>>
 }
 
@@ -101,6 +107,7 @@ func.func @b3(%arg0 : !fir.ref<!fir.array<?x!fir.char<1,?>>>, %arg1 : index, %ar
 func.func @b4(%arg0 : !fir.ref<!fir.array<7x!fir.char<1,?>>>, %arg1 : index) -> !fir.box<!fir.array<7x!fir.char<1,?>>> {
   %c_7 = arith.constant 7 : index
   %1 = fir.shape %c_7 : (index) -> !fir.shape<1>
+  // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }
   // CHECK:   %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]]
   // CHECK: insertvalue {{.*}} i64 %[[size]], 1
   // CHECK: insertvalue {{.*}} i32 20240719, 2
@@ -108,7 +115,8 @@ func.func @b4(%arg0 : !fir.ref<!fir.array<7x!fir.char<1,?>>>, %arg1 : index) ->
   // CHECK: insertvalue {{.*}} i64 %[[size]], 7, 0, 2
   // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0
   %x = fir.embox %arg0(%1) typeparams %arg1 : (!fir.ref<!fir.array<7x!fir.char<1,?>>>, !fir.shape<1>, index) -> !fir.box<!fir.array<7x!fir.char<1,?>>>
-  // CHECK: store {{.*}}, ptr %[[res]]
+  // CHECK: store {{.*}}, ptr %[[alloca]]
+  // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 48, i1 false)
   return %x : !fir.box<!fir.array<7x!fir.char<1,?>>>
 }
 
@@ -117,8 +125,7 @@ func.func @b4(%arg0 : !fir.ref<!fir.array<7x!fir.char<1,?>>>, %arg1 : index) ->
 // CHECK-SAME: ptr %[[arg0:.*]], ptr %[[arg1:.*]])
 func.func @b5(%arg0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, %arg1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {
   fir.store %arg1 to %arg0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
-  // CHECK: %[[boxLoad:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[arg1]]
-  // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %[[boxLoad]], ptr %[[arg0]]
+  // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %0, ptr %1, i32 72, i1 false)
   return
 }
 
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index 335877e7c9a872..168526518865b4 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -799,8 +799,8 @@ func.func @_QPs(%arg0: !fir.ref<complex<f32>> {fir.bindc_name = "x"}) {
 //CHECK:  omp.parallel   {
 //CHECK:    %[[CONST_1:.*]] = llvm.mlir.constant(1 : i32) : i32
 //CHECK:    %[[ALLOCA_1:.*]] = llvm.alloca %[[CONST_1:.*]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-//CHECK:    %[[LOAD:.*]] = llvm.load %[[ALLOCA]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-//CHECK:    llvm.store %[[LOAD]], %[[ALLOCA_1]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr
+//CHECK:    %[[SIZE:.*]] = llvm.mlir.constant(24 : i32) : i32
+//CHECK:    "llvm.intr.memcpy"(%[[ALLOCA_1]], %[[ALLOCA]], %[[SIZE]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
 //CHECK:    %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA_1]][0, 0] : (!llvm.ptr) -> !llvm.ptr
 //CHECK:    %[[LOAD_2:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> !llvm.ptr
 //CHECK:    omp.terminator
diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir
index 1182a0a10f218b..fa391fa6cc7a7d 100644
--- a/flang/test/Fir/convert-to-llvm.fir
+++ b/flang/test/Fir/convert-to-llvm.fir
@@ -862,8 +862,8 @@ func.func @test_store_box(%array : !fir.ref<!fir.box<!fir.array<?x?xf32>>>, %box
 // CHECK-LABEL:  llvm.func @test_store_box
 // CHECK-SAME:  (%[[arg0:.*]]: !llvm.ptr,
 // CHECK-SAME:  %[[arg1:.*]]: !llvm.ptr) {
-// CHECK-NEXT:  %[[box_to_store:.*]] = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i{{.*}}>>)>
-// CHECK-NEXT:  llvm.store %[[box_to_store]], %[[arg0]] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i{{.*}}>>)>, !llvm.ptr
+// CHECK-NEXT:  %[[size:.*]] = llvm.mlir.constant(72 : i32) : i32
+// CHECK-NEXT:  "llvm.intr.memcpy"(%[[arg0]], %[[arg1]], %[[size]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
 // CHECK-NEXT:  llvm.return
 // CHECK-NEXT:  }
 
@@ -875,15 +875,17 @@ func.func @store_unlimited_polymorphic_box(%arg0 : !fir.class<none>, %arg1 : !fi
   fir.store %arg3 to %arg3r : !fir.ref<!fir.box<!fir.array<?xnone>>>
   return
 }
-// CHECK-LABEL:   llvm.func @store_unlimited_polymorphic_box(
-// CHECK:  %[[VAL_8:.*]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>
-// CHECK:  llvm.store %[[VAL_8]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>, !llvm.ptr
-// CHECK:  %[[VAL_9:.*]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i{{.*}}>>, ptr, array<1 x i{{.*}}>)>
-// CHECK:  llvm.store %[[VAL_9]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i{{.*}}>>, ptr, array<1 x i{{.*}}>)>, !llvm.ptr
-// CHECK:  %[[VAL_10:.*]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>
-// CHECK:  llvm.store %[[VAL_10]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>, !llvm.ptr
-// CHECK:  %[[VAL_11:.*]] = llvm.load %{{.*}}: !llvm.ptr
-// CHECK:  llvm.store %[[VAL_11]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i{{.*}}>>, ptr, array<1 x i{{.*}}>)>, !llvm.ptr
+// CHECK:   llvm.func @store_unlimited_polymorphic_box(%[[VAL_0:.*]]: !llvm.ptr, %[[VAL_1:.*]]: !llvm.ptr, %[[VAL_2:.*]]: !llvm.ptr, %[[VAL_3:.*]]: !llvm.ptr, %[[VAL_4:.*]]: !llvm.ptr, %[[VAL_5:.*]]: !llvm.ptr, %[[VAL_6:.*]]: !llvm.ptr, %[[VAL_7:.*]]: !llvm.ptr) {
+// CHECK:     %[[VAL_8:.*]] = llvm.mlir.constant(40 : i32) : i32
+// CHECK:     "llvm.intr.memcpy"(%[[VAL_4]], %[[VAL_0]], %[[VAL_8]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK:     %[[VAL_9:.*]] = llvm.mlir.constant(64 : i32) : i32
+// CHECK:     "llvm.intr.memcpy"(%[[VAL_5]], %[[VAL_1]], %[[VAL_9]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK:     %[[VAL_10:.*]] = llvm.mlir.constant(40 : i32) : i32
+// CHECK:     "llvm.intr.memcpy"(%[[VAL_6]], %[[VAL_2]], %[[VAL_10]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK:     %[[VAL_11:.*]] = llvm.mlir.constant(64 : i32) : i32
+// CHECK:     "llvm.intr.memcpy"(%[[VAL_7]], %[[VAL_3]], %[[VAL_11]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK:     llvm.return
+// CHECK:   }
 
 
 // -----
@@ -935,8 +937,8 @@ func.func @test_load_box(%addr : !fir.ref<!fir.box<!fir.array<10xf32>>>) {
 // GENERIC-NEXT:  %[[box_copy:.*]] = llvm.alloca %[[c1]] x !llvm.struct<([[DESC_TYPE:.*]])>
 // AMDGPU-NEXT:   %[[alloca_box_copy:.*]] = llvm.alloca %[[c1]] x !llvm.struct<([[DESC_TYPE:.*]])>{{.*}} : (i32) -> !llvm.ptr<5>
 // AMDGPU-NEXT:   %[[box_copy:.*]] = llvm.addrspacecast %[[alloca_box_copy]] : !llvm.ptr<5> to !llvm.ptr
-// CHECK-NEXT:    %[[box_val:.*]] = llvm.load %[[arg0]] : !llvm.ptr -> !llvm.struct<([[DESC_TYPE]])>
-// CHECK-NEXT:    llvm.store %[[box_val]], %[[box_copy]] : !llvm.struct<([[DESC_TYPE]])>, !llvm.ptr
+// CHECK-NEXT:    %[[size:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK-NEXT:    "llvm.intr.memcpy"(%[[box_copy]], %[[arg0]], %[[size]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
 // CHECK-NEXT:    llvm.call @takes_box(%[[box_copy]]) : (!llvm.ptr) -> ()
 // CHECK-NEXT:    llvm.return
 // CHECK-NEXT:  }
diff --git a/flang/test/Fir/embox-char.fir b/flang/test/Fir/embox-char.fir
index bf8344dbb60fc8..efb069f96520d4 100644
--- a/flang/test/Fir/embox-char.fir
+++ b/flang/test/Fir/embox-char.fir
@@ -1,3 +1,10 @@
+// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+// The script is designed to make adding checks to
+// a test case fast, it is *not* designed to be authoritative
+// about what constitutes a good test! The CHECK should be
+// minimized and named to reflect the test intent.
+
 // Test that the offset of the first element of the slice
 // is computed in elements of the type used for the GEP
 // computing the base of the slice.
@@ -10,42 +17,40 @@
 //   print *, x(2,:)
 // end subroutine
 
-// CHECK-LABEL:   llvm.func @test_char4(
-// CHECK-SAME:        %[[VAL_0:.*]]: !llvm.ptr,
-// CHECK-SAME:        %[[VAL_1_SLICE_LB0:.*]]: i64, %[[VAL_2_SLICE_EX0:.*]]: i64, %[[VAL_3_SLICE_ST0:.*]]: i64, %[[VAL_4_SLICE_LB1:.*]]: i64, %[[VAL_5_SLICE_EX1:.*]]: i64, %[[VAL_6_SLICE_ST1:.*]]: i64) {
+// CHECK:   llvm.func @test_char4(%[[VAL_0:.*]]: !llvm.ptr, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[VAL_5:.*]]: i64, %[[VAL_6:.*]]: i64) {
 // CHECK:           %[[VAL_7:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK:           %[[VAL_8:.*]] = llvm.alloca %[[VAL_7]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
 // CHECK:           %[[VAL_9:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK:           %[[VAL_10:.*]] = llvm.alloca %[[VAL_9]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
 // CHECK:           %[[VAL_11:.*]] = llvm.mlir.constant(0 : index) : i64
 // CHECK:           %[[VAL_12:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK:           %[[VAL_13_WIDTH:.*]] = llvm.mlir.constant(4 : index) : i64
-// CHECK:           %[[VAL_14:.*]] = llvm.load %[[VAL_0]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
-// CHECK:           llvm.store %[[VAL_14]], %[[VAL_10]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr
+// CHECK:           %[[VAL_13:.*]] = llvm.mlir.constant(4 : index) : i64
+// CHECK:           %[[VAL_14:.*]] = llvm.mlir.constant(72 : i32) : i32
+// CHECK:           "llvm.intr.memcpy"(%[[VAL_10]], %[[VAL_0]], %[[VAL_14]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
 // CHECK:           %[[VAL_15:.*]] = llvm.getelementptr %[[VAL_10]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
-// CHECK:           %[[VAL_16_BYTESIZE:.*]] = llvm.load %[[VAL_15]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_16:.*]] = llvm.load %[[VAL_15]] : !llvm.ptr -> i64
 // CHECK:           %[[VAL_17:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
-// CHECK:           %[[VAL_18_LB1:.*]] = llvm.load %[[VAL_17]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_18:.*]] = llvm.load %[[VAL_17]] : !llvm.ptr -> i64
 // CHECK:           %[[VAL_19:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
-// CHECK:           %[[VAL_20_EX1:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_20:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> i64
 // CHECK:           %[[VAL_21:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
-// CHECK:           %[[VAL_22_ST1:.*]] = llvm.load %[[VAL_21]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_22:.*]] = llvm.load %[[VAL_21]] : !llvm.ptr -> i64
 // CHECK:           %[[VAL_23:.*]] = llvm.getelementptr %[[VAL_10]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
-// CHECK:           %[[VAL_24_BASEPTR:.*]] = llvm.load %[[VAL_23]] : !llvm.ptr -> !llvm.ptr
+// CHECK:           %[[VAL_24:.*]] = llvm.load %[[VAL_23]] : !llvm.ptr -> !llvm.ptr
 // CHECK:           %[[VAL_25:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
-// CHECK:           %[[VAL_26_LB0:.*]] = llvm.load %[[VAL_25]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_26:.*]] = llvm.load %[[VAL_25]] : !llvm.ptr -> i64
 // CHECK:           %[[VAL_27:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x arr...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/113949