[flang-commits] [flang] [Flang] Hoisting constant-sized allocas at flang codegen. (PR #95310)
via flang-commits
flang-commits at lists.llvm.org
Wed Jun 12 13:52:43 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-codegen
@llvm/pr-subscribers-flang-fir-hlfir
Author: Vijay Kandiah (VijayKandiah)
<details>
<summary>Changes</summary>
This change modifies the `AllocaOpConversion` in flang codegen to insert constant-sized LLVM allocas at the entry block of `LLVMFuncOp` or OpenACC/OpenMP Op, rather than in-place at the `fir.alloca`. This effectively hoists constant-sized FIR allocas to the proper block.
When compiling the example subroutine below with `flang-new`, we get a llvm.stacksave/stackrestore pair around a constant-sized `fir.alloca i32`.
```
subroutine test(n)
block
integer :: n
print *, n
end block
end subroutine test
```
Without the proposed change, downstream LLVM compilation cannot hoist this constant-sized alloca out of the stacksave/stackrestore region which may lead to missed downstream optimizations:
```
*** IR Dump After Safe Stack instrumentation pass (safe-stack) ***
define void @<!-- -->test_(ptr %0) !dbg !3 {
%2 = call ptr @<!-- -->llvm.stacksave.p0(), !dbg !7
%3 = alloca i32, i64 1, align 4, !dbg !8
%4 = call ptr @<!-- -->_FortranAioBeginExternalListOutput(i32 6, ptr @<!-- -->_QQclX62c91d05f046c7a656e7978eb13f2821, i32 4), !dbg !9
%5 = load i32, ptr %3, align 4, !dbg !10, !tbaa !11
%6 = call i1 @<!-- -->_FortranAioOutputInteger32(ptr %4, i32 %5), !dbg !10
%7 = call i32 @<!-- -->_FortranAioEndIoStatement(ptr %4), !dbg !9
call void @<!-- -->llvm.stackrestore.p0(ptr %2), !dbg !15
ret void, !dbg !16
}
```
With this change, the `llvm.alloca` is already hoisted out of the stacksave/stackrestore region during flang codegen:
```
// -----// IR Dump After FIRToLLVMLowering (fir-to-llvm-ir) //----- //
llvm.func @<!-- -->test_(%arg0: !llvm.ptr {fir.bindc_name = "n"}) attributes {fir.internal_name = "_QPtest"} {
%0 = llvm.mlir.constant(4 : i32) : i32
%1 = llvm.mlir.constant(1 : i64) : i64
%2 = llvm.alloca %1 x i32 {bindc_name = "n"} : (i64) -> !llvm.ptr
%3 = llvm.mlir.constant(6 : i32) : i32
%4 = llvm.mlir.undef : i1
%5 = llvm.call @<!-- -->llvm.stacksave.p0() {fastmathFlags = #llvm.fastmath<contract>} : () -> !llvm.ptr
%6 = llvm.mlir.addressof @<!-- -->_QQclX62c91d05f046c7a656e7978eb13f2821 : !llvm.ptr
%7 = llvm.call @<!-- -->_FortranAioBeginExternalListOutput(%3, %6, %0) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, i32) -> !llvm.ptr
%8 = llvm.load %2 {tbaa = [#tbaa_tag]} : !llvm.ptr -> i32
%9 = llvm.call @<!-- -->_FortranAioOutputInteger32(%7, %8) {fastmathFlags = #llvm.fastmath<contract>} : (!llvm.ptr, i32) -> i1
%10 = llvm.call @<!-- -->_FortranAioEndIoStatement(%7) {fastmathFlags = #llvm.fastmath<contract>} : (!llvm.ptr) -> i32
llvm.call @<!-- -->llvm.stackrestore.p0(%5) {fastmathFlags = #llvm.fastmath<contract>} : (!llvm.ptr) -> ()
llvm.return
}
```
---
Full diff: https://github.com/llvm/llvm-project/pull/95310.diff
3 Files Affected:
- (modified) flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h (+7-4)
- (modified) flang/lib/Optimizer/CodeGen/CodeGen.cpp (+14-4)
- (modified) flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp (+23-20)
``````````diff
diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
index 211acdc8a38e6..6ace73e2d16af 100644
--- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
+++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
@@ -51,7 +51,9 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern {
/// appropriate reified structures.
mlir::Value integerCast(mlir::Location loc,
mlir::ConversionPatternRewriter &rewriter,
- mlir::Type ty, mlir::Value val) const;
+ mlir::Type ty, mlir::Value val,
+ bool fold = false) const;
+
struct TypePair {
mlir::Type fir;
mlir::Type llvm;
@@ -144,9 +146,10 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern {
// Find the Block in which the alloca should be inserted.
// The order to recursively find the proper block:
// 1. An OpenMP Op that will be outlined.
- // 2. A LLVMFuncOp
- // 3. The first ancestor that is an OpenMP Op or a LLVMFuncOp
- mlir::Block *getBlockForAllocaInsert(mlir::Operation *op) const;
+ // 2. An OpenMP or OpenACC Op with one or more regions holding executable code.
+ // 3. A LLVMFuncOp
+ // 4. The first ancestor that is one of the above.
+ mlir::Block *getBlockForAllocaInsert(mlir::Operation *op, mlir::Region *parentRegion) const;
// Generate an alloca of size 1 for an object of type \p llvmObjectTy in the
// allocation address space provided for the architecture in the DataLayout
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 9f21c6b0cf097..d078a000ccd65 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -218,7 +218,7 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
chrTy.getContext(), chrTy.getFKind());
llvmObjectType = convertType(rawCharTy);
assert(end == 1);
- size = integerCast(loc, rewriter, ity, lenParams[0]);
+ size = integerCast(loc, rewriter, ity, lenParams[0], /*fold=*/true);
} else if (auto recTy = mlir::dyn_cast<fir::RecordType>(scalarType)) {
mlir::LLVM::LLVMFuncOp memSizeFn =
getDependentTypeMemSizeFn(recTy, alloc, rewriter);
@@ -236,17 +236,27 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
}
}
if (auto scaleSize = genAllocationScaleSize(alloc, ity, rewriter))
- size = rewriter.create<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
+ size = rewriter.createOrFold<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
if (alloc.hasShapeOperands()) {
unsigned end = operands.size();
for (; i < end; ++i)
- size = rewriter.create<mlir::LLVM::MulOp>(
- loc, ity, size, integerCast(loc, rewriter, ity, operands[i]));
+ size = rewriter.createOrFold<mlir::LLVM::MulOp>(
+ loc, ity, size,
+ integerCast(loc, rewriter, ity, operands[i], /*fold=*/true));
}
unsigned allocaAs = getAllocaAddressSpace(rewriter);
unsigned programAs = getProgramAddressSpace(rewriter);
+ if (mlir::isa<mlir::LLVM::ConstantOp>(size.getDefiningOp())) {
+ // Set the Block in which the llvm alloca should be inserted.
+ mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
+ mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
+ mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp, parentRegion);
+ size.getDefiningOp()->moveAfter(insertBlock, insertBlock->begin());
+ rewriter.setInsertionPointAfter(size.getDefiningOp());
+ }
+
// NOTE: we used to pass alloc->getAttrs() in the builder for non opaque
// pointers! Only propagate pinned and bindc_name to help debugging, but
// this should have no functional purpose (and passing the operand segment
diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
index 72e072db37432..6d86879cd3219 100644
--- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
+++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
@@ -65,7 +65,7 @@ mlir::LLVM::ConstantOp ConvertFIRToLLVMPattern::genConstantOffset(
mlir::Value
ConvertFIRToLLVMPattern::integerCast(mlir::Location loc,
mlir::ConversionPatternRewriter &rewriter,
- mlir::Type ty, mlir::Value val) const {
+ mlir::Type ty, mlir::Value val, bool fold) const {
auto valTy = val.getType();
// If the value was not yet lowered, lower its type so that it can
// be used in getPrimitiveTypeSizeInBits.
@@ -73,10 +73,17 @@ ConvertFIRToLLVMPattern::integerCast(mlir::Location loc,
valTy = convertType(valTy);
auto toSize = mlir::LLVM::getPrimitiveTypeSizeInBits(ty);
auto fromSize = mlir::LLVM::getPrimitiveTypeSizeInBits(valTy);
- if (toSize < fromSize)
- return rewriter.create<mlir::LLVM::TruncOp>(loc, ty, val);
- if (toSize > fromSize)
- return rewriter.create<mlir::LLVM::SExtOp>(loc, ty, val);
+ if (fold) {
+ if (toSize < fromSize)
+ return rewriter.createOrFold<mlir::LLVM::TruncOp>(loc, ty, val);
+ if (toSize > fromSize)
+ return rewriter.createOrFold<mlir::LLVM::SExtOp>(loc, ty, val);
+ } else {
+ if (toSize < fromSize)
+ return rewriter.create<mlir::LLVM::TruncOp>(loc, ty, val);
+ if (toSize > fromSize)
+ return rewriter.create<mlir::LLVM::SExtOp>(loc, ty, val);
+ }
return val;
}
@@ -274,16 +281,19 @@ mlir::Value ConvertFIRToLLVMPattern::computeBoxSize(
// Find the Block in which the alloca should be inserted.
// The order to recursively find the proper block:
// 1. An OpenMP Op that will be outlined.
-// 2. A LLVMFuncOp
-// 3. The first ancestor that is an OpenMP Op or a LLVMFuncOp
-mlir::Block *
-ConvertFIRToLLVMPattern::getBlockForAllocaInsert(mlir::Operation *op) const {
+// 2. An OpenMP or OpenACC Op with one or more regions holding executable code.
+// 3. A LLVMFuncOp
+// 4. The first ancestor that is one of the above.
+mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert(
+ mlir::Operation *op, mlir::Region *parentRegion) const {
if (auto iface = mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op))
return iface.getAllocaBlock();
+ if (auto recipeIface = mlir::dyn_cast<mlir::accomp::RecipeInterface>(op))
+ return recipeIface.getAllocaBlock(*parentRegion);
if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
return &llvmFuncOp.front();
- return getBlockForAllocaInsert(op->getParentOp());
+ return getBlockForAllocaInsert(op->getParentOp(), parentRegion);
}
// Generate an alloca of size 1 for an object of type \p llvmObjectTy in the
@@ -297,16 +307,9 @@ mlir::Value ConvertFIRToLLVMPattern::genAllocaAndAddrCastWithType(
mlir::ConversionPatternRewriter &rewriter) const {
auto thisPt = rewriter.saveInsertionPoint();
mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
- if (mlir::isa<mlir::omp::DeclareReductionOp>(parentOp) ||
- mlir::isa<mlir::omp::PrivateClauseOp>(parentOp)) {
- // DeclareReductionOp & PrivateClauseOp have multiple child regions. We want
- // to get the first block of whichever of those regions we are currently in
- mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
- rewriter.setInsertionPointToStart(&parentRegion->front());
- } else {
- mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp);
- rewriter.setInsertionPointToStart(insertBlock);
- }
+ mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
+ mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp, parentRegion);
+ rewriter.setInsertionPointToStart(insertBlock);
auto size = genI32Constant(loc, rewriter, 1);
unsigned allocaAs = getAllocaAddressSpace(rewriter);
unsigned programAs = getProgramAddressSpace(rewriter);
``````````
</details>
https://github.com/llvm/llvm-project/pull/95310
More information about the flang-commits
mailing list