[flang-commits] [flang] [flang] Reset all extents to zero for empty hlfir.elemental loops. (PR #124867)

Tue Jan 28 17:21:18 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Slava Zakharin (vzakhari)

<details>
<summary>Changes</summary>

An hlfir.elemental with a shape `(0, HUGE)` still runs `HUGE`
number of iterations when expanded into a loop nest.
HLFIR transformational operations inlined as hlfir.elemental
may execute slower comparing to Fortran runtime implementation.
This patch adds an option for BufferizeHLFIR pass to reset all
upper bounds in the elemental loop nests to zero, if the result
is an empty array.

A separate patch will enable this option in the driver after I do
more performance testing. The option is off by default now.


---
Full diff: https://github.com/llvm/llvm-project/pull/124867.diff


5 Files Affected:

- (modified) flang/include/flang/Optimizer/Builder/FIRBuilder.h (+12) 
- (modified) flang/include/flang/Optimizer/HLFIR/Passes.td (+5) 
- (modified) flang/lib/Optimizer/Builder/FIRBuilder.cpp (+35) 
- (modified) flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp (+20-9) 
- (added) flang/test/HLFIR/elemental-with-empty-check-codegen.fir (+53) 


``````````diff

diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index ea658fb16a36c3..f2252d04079cf2 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -813,6 +813,18 @@ uint64_t getAllocaAddressSpace(mlir::DataLayout *dataLayout);
 llvm::SmallVector<mlir::Value> deduceOptimalExtents(mlir::ValueRange extents1,
                                                     mlir::ValueRange extents2);
 
+/// Given array extents generate code that sets them all to zeroes,
+/// if the array is empty, e.g.:
+///   %false = arith.constant false
+///   %c0 = arith.constant 0 : index
+///   %p1 = arith.cmpi eq, %e0, %c0 : index
+///   %p2 = arith.ori %false, %p1 : i1
+///   %p3 = arith.cmpi eq, %e1, %c0 : index
+///   %p4 = arith.ori %p1, %p2 : i1
+///   %result0 = arith.select %p4, %c0, %e0 : index
+///   %result1 = arith.select %p4, %c0, %e1 : index
+llvm::SmallVector<mlir::Value> updateRuntimeExtentsForEmptyArrays(
+    fir::FirOpBuilder &builder, mlir::Location loc, mlir::ValueRange extents);
 } // namespace fir::factory
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index 90cf6e74241bd0..eae0c9ca2e3661 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -19,6 +19,11 @@ def ConvertHLFIRtoFIR : Pass<"convert-hlfir-to-fir", "::mlir::ModuleOp"> {
 
 def BufferizeHLFIR : Pass<"bufferize-hlfir", "::mlir::ModuleOp"> {
   let summary = "Convert HLFIR operations operating on hlfir.expr into operations on memory";
+  let options = [Option<"optimizeEmptyElementals", "opt-empty-elementals",
+                        "bool", /*default=*/"false",
+                        "When converting hlfir.elemental into a loop nest, "
+                        "check if the resulting expression is an empty array, "
+                        "and make sure none of the loops is executed.">];
 }
 
 def OptimizedBufferization : Pass<"opt-bufferization"> {
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 35dc9a2abd69c4..b9056cad5c0d66 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -1759,3 +1759,38 @@ fir::factory::deduceOptimalExtents(mlir::ValueRange extents1,
   }
   return extents;
 }
+
+llvm::SmallVector<mlir::Value> fir::factory::updateRuntimeExtentsForEmptyArrays(
+    fir::FirOpBuilder &builder, mlir::Location loc, mlir::ValueRange extents) {
+  if (extents.size() <= 1)
+    return extents;
+
+  // Try to reduce the number of new zero constant operations.
+  // This just makes MLIR matching easier, if CSE is not run
+  // after this.
+  llvm::DenseMap<mlir::Type, mlir::Value> zeroesMap;
+  mlir::Type i1Type = builder.getI1Type();
+  mlir::Value isEmpty = createZeroValue(builder, loc, i1Type);
+  zeroesMap.try_emplace(i1Type, isEmpty);
+
+  llvm::SmallVector<mlir::Value, Fortran::common::maxRank> zeroes;
+  for (mlir::Value extent : extents) {
+    mlir::Type type = extent.getType();
+    mlir::Value zero = zeroesMap.lookup(type);
+    if (!zero) {
+      zero = createZeroValue(builder, loc, type);
+      zeroesMap.try_emplace(type, zero);
+    }
+    zeroes.push_back(zero);
+    mlir::Value isZero = builder.create<mlir::arith::CmpIOp>(
+        loc, mlir::arith::CmpIPredicate::eq, extent, zero);
+    isEmpty = builder.create<mlir::arith::OrIOp>(loc, isEmpty, isZero);
+  }
+
+  llvm::SmallVector<mlir::Value, Fortran::common::maxRank> newExtents;
+  for (auto [zero, extent] : llvm::zip_equal(zeroes, extents)) {
+    newExtents.push_back(
+        builder.create<mlir::arith::SelectOp>(loc, isEmpty, zero, extent));
+  }
+  return newExtents;
+}
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
index 48e3db0eb39b67..34567598a50050 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
@@ -760,8 +760,10 @@ struct HLFIRListener : public mlir::OpBuilder::Listener {
 struct ElementalOpConversion
     : public mlir::OpConversionPattern<hlfir::ElementalOp> {
   using mlir::OpConversionPattern<hlfir::ElementalOp>::OpConversionPattern;
-  explicit ElementalOpConversion(mlir::MLIRContext *ctx)
-      : mlir::OpConversionPattern<hlfir::ElementalOp>{ctx} {
+  explicit ElementalOpConversion(mlir::MLIRContext *ctx,
+                                 bool optimizeEmptyElementals = false)
+      : mlir::OpConversionPattern<hlfir::ElementalOp>{ctx},
+        optimizeEmptyElementals(optimizeEmptyElementals) {
     // This pattern recursively converts nested ElementalOp's
     // by cloning and then converting them, so we have to allow
     // for recursive pattern application. The recursion is bounded
@@ -790,6 +792,10 @@ struct ElementalOpConversion
     // of the loop nest.
     temp = derefPointersAndAllocatables(loc, builder, temp);
 
+    if (optimizeEmptyElementals)
+      extents = fir::factory::updateRuntimeExtentsForEmptyArrays(builder, loc,
+                                                                 extents);
+
     // Generate a loop nest looping around the fir.elemental shape and clone
     // fir.elemental region inside the inner loop.
     hlfir::LoopNest loopNest =
@@ -860,6 +866,9 @@ struct ElementalOpConversion
     rewriter.replaceOp(elemental, bufferizedExpr);
     return mlir::success();
   }
+
+private:
+  bool optimizeEmptyElementals = false;
 };
 struct CharExtremumOpConversion
     : public mlir::OpConversionPattern<hlfir::CharExtremumOp> {
@@ -931,6 +940,8 @@ struct EvaluateInMemoryOpConversion
 
 class BufferizeHLFIR : public hlfir::impl::BufferizeHLFIRBase<BufferizeHLFIR> {
 public:
+  using BufferizeHLFIRBase<BufferizeHLFIR>::BufferizeHLFIRBase;
+
   void runOnOperation() override {
     // TODO: make this a pass operating on FuncOp. The issue is that
     // FirOpBuilder helpers may generate new FuncOp because of runtime/llvm
@@ -942,13 +953,13 @@ class BufferizeHLFIR : public hlfir::impl::BufferizeHLFIRBase<BufferizeHLFIR> {
     auto module = this->getOperation();
     auto *context = &getContext();
     mlir::RewritePatternSet patterns(context);
-    patterns
-        .insert<ApplyOpConversion, AsExprOpConversion, AssignOpConversion,
-                AssociateOpConversion, CharExtremumOpConversion,
-                ConcatOpConversion, DestroyOpConversion, ElementalOpConversion,
-                EndAssociateOpConversion, EvaluateInMemoryOpConversion,
-                NoReassocOpConversion, SetLengthOpConversion,
-                ShapeOfOpConversion, GetLengthOpConversion>(context);
+    patterns.insert<ApplyOpConversion, AsExprOpConversion, AssignOpConversion,
+                    AssociateOpConversion, CharExtremumOpConversion,
+                    ConcatOpConversion, DestroyOpConversion,
+                    EndAssociateOpConversion, EvaluateInMemoryOpConversion,
+                    NoReassocOpConversion, SetLengthOpConversion,
+                    ShapeOfOpConversion, GetLengthOpConversion>(context);
+    patterns.insert<ElementalOpConversion>(context, optimizeEmptyElementals);
     mlir::ConversionTarget target(*context);
     // Note that YieldElementOp is not marked as an illegal operation.
     // It must be erased by its parent converter and there is no explicit
diff --git a/flang/test/HLFIR/elemental-with-empty-check-codegen.fir b/flang/test/HLFIR/elemental-with-empty-check-codegen.fir
new file mode 100644
index 00000000000000..31275b4ac61430
--- /dev/null
+++ b/flang/test/HLFIR/elemental-with-empty-check-codegen.fir
@@ -0,0 +1,53 @@
+// Test hlfir.elemental code generation with a dynamic check
+// for empty result array
+// RUN: fir-opt %s --bufferize-hlfir=opt-empty-elementals=true | FileCheck %s
+
+func.func @test(%v: i32, %e0: i32, %e1: i32, %e2: i64, %e3: i64) {
+  %shape = fir.shape %e0, %e1, %e2, %e3 : (i32, i32, i64, i64) -> !fir.shape<4>
+  %result = hlfir.elemental %shape : (!fir.shape<4>) -> !hlfir.expr<?x?x?x?xi32> {
+  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+    hlfir.yield_element %v : i32
+  }
+  return
+}
+// CHECK-LABEL:   func.func @test(
+// CHECK-SAME:                    %[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: i32,
+// CHECK-SAME:                    %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64) {
+// CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] : (i32, i32, i64, i64) -> !fir.shape<4>
+// CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_1]] : (i32) -> index
+// CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_2]] : (i32) -> index
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_3]] : (i64) -> index
+// CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
+// CHECK:           %[[VAL_10:.*]] = fir.allocmem !fir.array<?x?x?x?xi32>, %[[VAL_6]], %[[VAL_7]], %[[VAL_8]], %[[VAL_9]] {bindc_name = ".tmp.array", uniq_name = ""}
+// CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_5]]) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<?x?x?x?xi32>>, !fir.shape<4>) -> (!fir.box<!fir.array<?x?x?x?xi32>>, !fir.heap<!fir.array<?x?x?x?xi32>>)
+// CHECK:           %[[VAL_12:.*]] = arith.constant true
+// CHECK:           %[[VAL_13:.*]] = arith.constant false
+// CHECK:           %[[VAL_14:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_6]], %[[VAL_14]] : index
+// CHECK:           %[[VAL_16:.*]] = arith.ori %[[VAL_13]], %[[VAL_15]] : i1
+// CHECK:           %[[VAL_17:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_14]] : index
+// CHECK:           %[[VAL_18:.*]] = arith.ori %[[VAL_16]], %[[VAL_17]] : i1
+// CHECK:           %[[VAL_19:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_14]] : index
+// CHECK:           %[[VAL_20:.*]] = arith.ori %[[VAL_18]], %[[VAL_19]] : i1
+// CHECK:           %[[VAL_21:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_14]] : index
+// CHECK:           %[[VAL_22:.*]] = arith.ori %[[VAL_20]], %[[VAL_21]] : i1
+// CHECK:           %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_14]], %[[VAL_6]] : index
+// CHECK:           %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_14]], %[[VAL_7]] : index
+// CHECK:           %[[VAL_25:.*]] = arith.select %[[VAL_22]], %[[VAL_14]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_26:.*]] = arith.select %[[VAL_22]], %[[VAL_14]], %[[VAL_9]] : index
+// CHECK:           %[[VAL_27:.*]] = arith.constant 1 : index
+// CHECK:           fir.do_loop %[[VAL_28:.*]] = %[[VAL_27]] to %[[VAL_26]] step %[[VAL_27]] {
+// CHECK:             fir.do_loop %[[VAL_29:.*]] = %[[VAL_27]] to %[[VAL_25]] step %[[VAL_27]] {
+// CHECK:               fir.do_loop %[[VAL_30:.*]] = %[[VAL_27]] to %[[VAL_24]] step %[[VAL_27]] {
+// CHECK:                 fir.do_loop %[[VAL_31:.*]] = %[[VAL_27]] to %[[VAL_23]] step %[[VAL_27]] {
+// CHECK:                   %[[VAL_32:.*]] = hlfir.designate %[[VAL_11]]#0 (%[[VAL_31]], %[[VAL_30]], %[[VAL_29]], %[[VAL_28]])  : (!fir.box<!fir.array<?x?x?x?xi32>>, index, index, index, index) -> !fir.ref<i32>
+// CHECK:                   hlfir.assign %[[VAL_0]] to %[[VAL_32]] temporary_lhs : i32, !fir.ref<i32>
+// CHECK:                 }
+// CHECK:               }
+// CHECK:             }
+// CHECK:           }
+// CHECK:           %[[VAL_33:.*]] = fir.undefined tuple<!fir.box<!fir.array<?x?x?x?xi32>>, i1>
+// CHECK:           %[[VAL_34:.*]] = fir.insert_value %[[VAL_33]], %[[VAL_12]], [1 : index] : (tuple<!fir.box<!fir.array<?x?x?x?xi32>>, i1>, i1) -> tuple<!fir.box<!fir.array<?x?x?x?xi32>>, i1>
+// CHECK:           %[[VAL_35:.*]] = fir.insert_value %[[VAL_34]], %[[VAL_11]]#0, [0 : index] : (tuple<!fir.box<!fir.array<?x?x?x?xi32>>, i1>, !fir.box<!fir.array<?x?x?x?xi32>>) -> tuple<!fir.box<!fir.array<?x?x?x?xi32>>, i1>
+// CHECK:           return
+// CHECK:         }

``````````

</details>


https://github.com/llvm/llvm-project/pull/124867