[llvm] [LICM] Promote conditional, loop-invariant memory accesses to scalars with intrinsic (PR #93999)

Ivan Shumakov via llvm-commits llvm-commits at lists.llvm.org
Fri May 31 11:03:32 PDT 2024


https://github.com/ii-sc created https://github.com/llvm/llvm-project/pull/93999

There is a missed opportunity to promote conditional store in the LICM pass. It has been implemented in this old patch: 

https://reviews.llvm.org/D115244

This patch has a minor flaw: on some architectures, the masked store intrinsic lowers to vector instructions. For example, on the RISC-V architecture this code
```cpp
int res;

void test(int * restrict a, int N) {
    for (int i = 0; i < N; ++i)
        if (a[i])
            ++res;
}
```
translates to the assembler with vector instructions:
```asm
.LBB0_2:                                # %for.cond.cleanup
	andi	a4, a4, 1
	vsetivli	zero, 1, e32, mf2, ta, ma
	vmv.v.x	v9, a4
	vmv.v.x	v8, a3
	vsetvli	zero, zero, e8, mf8, ta, ma
	vmsne.vi	v0, v9, 0
	vse32.v	v8, (a2), v0.t
```
which is unnecessary here. 

I have implemented the `conditional_store` intrinsic for this patch in addition to the original one. 

>From 59a0f70995a1815df45da0144ebdff92e20d259a Mon Sep 17 00:00:00 2001
From: Ivan Shumakov <ivan.shumakov at syntacore.com>
Date: Mon, 20 May 2024 12:05:39 +0300
Subject: [PATCH 1/2] [LLVM] Store promotion possibility

---
 ...conditional-store-promotion-possibility.ll | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 llvm/test/Transforms/LICM/conditional-store-promotion-possibility.ll

diff --git a/llvm/test/Transforms/LICM/conditional-store-promotion-possibility.ll b/llvm/test/Transforms/LICM/conditional-store-promotion-possibility.ll
new file mode 100644
index 0000000000000..b694ea102ff09
--- /dev/null
+++ b/llvm/test/Transforms/LICM/conditional-store-promotion-possibility.ll
@@ -0,0 +1,66 @@
+; RUN: opt -S -passes=licm < %s | FileCheck %s
+ at res = dso_local local_unnamed_addr global i32 0, align 4
+
+define dso_local void @test(ptr noalias nocapture noundef readonly %a, i32 noundef signext %N) local_unnamed_addr #0 {
+  ; Preheader:
+  entry:
+    br label %for.cond
+
+  ; Loop:
+  for.cond:                                         ; preds = %for.inc, %entry
+    %i.0 = phi i32 [ 0, %entry ], [ %inc1, %for.inc ]
+    %cmp = icmp slt i32 %i.0, %N
+    br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+  for.body:                                         ; preds = %for.cond
+    %idxprom = zext i32 %i.0 to i64
+    %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+    %0 = load i32, ptr %arrayidx, align 4
+    %tobool.not = icmp eq i32 %0, 0
+    br i1 %tobool.not, label %for.inc, label %if.then
+
+  if.then:                                          ; preds = %for.body
+    %1 = load i32, ptr @res, align 4
+    %inc = add nsw i32 %1, 1
+    store i32 %inc, ptr @res, align 4
+    br label %for.inc
+
+  for.inc:                                          ; preds = %for.body, %if.then
+    %inc1 = add nuw nsw i32 %i.0, 1
+    br label %for.cond 
+
+  ; Exit blocks
+  for.cond.cleanup:                                 ; preds = %for.cond
+    ret void
+}
+
+; CHECK:  entry:
+; CHECK:    %res.promoted = load i32, ptr @res, align 4
+; CHECK:    br label %for.cond
+
+; CHECK:  for.cond:
+; CHECK:    %inc3 = phi i32 [ %res.promoted, %entry ], [ %inc2, %for.inc ]
+; CHECK:    %i.0 = phi i32 [ 0, %entry ], [ %inc1, %for.inc ]
+; CHECK:    %cmp = icmp slt i32 %i.0, %N
+; CHECK:    br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK:  for.body:
+; CHECK:    %idxprom = zext i32 %i.0 to i64
+; CHECK:    %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+; CHECK:    %0 = load i32, ptr %arrayidx, align 4
+; CHECK:    %tobool.not = icmp eq i32 %0, 0
+; CHECK:    br i1 %tobool.not, label %for.inc, label %if.then
+
+; CHECK:  if.then:
+; CHECK:    %inc = add nsw i32 %inc3, 1
+; CHECK:    store i32 %inc, ptr @res, align 4
+; CHECK:    br label %for.inc
+
+; CHECK:  for.inc:
+; CHECK:    %inc2 = phi i32 [ %inc, %if.then ], [ %inc3, %for.body ]
+; CHECK:    %inc1 = add nuw nsw i32 %i.0, 1
+; CHECK:    br label %for.cond
+
+; CHECK:  for.cond.cleanup:
+  ; CHECK:    ret void
+; CHECK:  }
\ No newline at end of file

>From 28ead3aefcf9cf1e56fcd47f3c0f71f7c29422ec Mon Sep 17 00:00:00 2001
From: Ivan Shumakov <ivan.shumakov at syntacore.com>
Date: Mon, 8 Apr 2024 19:18:22 +0300
Subject: [PATCH 2/2] [LLVM] Conditional store promotion with corresponding
 intrinsic have been added

---
 llvm/include/llvm/IR/IRBuilder.h              |   4 +
 llvm/include/llvm/IR/Intrinsics.td            |  15 +++
 llvm/include/llvm/InitializePasses.h          |   1 +
 llvm/include/llvm/LinkAllPasses.h             |   1 +
 llvm/include/llvm/Transforms/Scalar.h         |   5 +
 .../Scalar/LowerConditionalStoreIntrinsic.h   |  30 +++++
 llvm/lib/IR/IRBuilder.cpp                     |  16 +++
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  10 +-
 llvm/lib/Passes/PassRegistry.def              |   1 +
 .../Target/AArch64/AArch64TargetMachine.cpp   |   3 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   6 +-
 llvm/lib/Target/PowerPC/PPCTargetMachine.cpp  |   3 +
 llvm/lib/Transforms/Scalar/CMakeLists.txt     |   1 +
 llvm/lib/Transforms/Scalar/LICM.cpp           |  69 ++++++++++-
 .../Scalar/LowerConditionalStoreIntrinsic.cpp | 115 ++++++++++++++++++
 llvm/lib/Transforms/Scalar/Scalar.cpp         |   1 +
 .../LICM/conditional-store-intrinsic.ll       |  22 ++++
 ...conditional-store-promotion-possibility.ll |  18 ++-
 .../LICM/promote-conditional-store-intr.ll    |  69 +++++++++++
 20 files changed, 380 insertions(+), 11 deletions(-)
 create mode 100644 llvm/include/llvm/Transforms/Scalar/LowerConditionalStoreIntrinsic.h
 create mode 100644 llvm/lib/Transforms/Scalar/LowerConditionalStoreIntrinsic.cpp
 create mode 100644 llvm/test/Transforms/LICM/conditional-store-intrinsic.ll
 create mode 100644 llvm/test/Transforms/LICM/promote-conditional-store-intr.ll

diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 40a9cf507248a..06fc284c7a824 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -818,6 +818,10 @@ class IRBuilderBase {
   /// Create a call to llvm.threadlocal.address intrinsic.
   CallInst *CreateThreadLocalAddress(Value *Ptr);
 
+  // Create a call to a Conditional Store intrinsic
+  CallInst *CreateConditionalStore(Value *Val, Value *Ptr, Align Alignment,
+                                   Value *Condition);
+
   /// Create a call to Masked Load intrinsic
   CallInst *CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask,
                              Value *PassThru = nullptr, const Twine &Name = "");
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 107442623ab7b..95a9f6cc04de2 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2322,6 +2322,21 @@ def int_vp_is_fpclass:
                                 llvm_i32_ty],
                               [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>;
 
+//===-------------------------- Conditional Intrinsics --------------------===//
+//
+
+def int_conditional_store:
+  DefaultAttrsIntrinsic</*ret_types*/[],
+                        /*param_types*/[/*Val*/llvm_any_ty,
+                                        /*Ptr*/llvm_anyptr_ty,
+                                        /*Alignment*/llvm_i32_ty,
+                                        /*Condition*/llvm_i1_ty],
+                        /*intr_properties*/[IntrWriteMem,
+                                            IntrArgMemOnly,
+                                            IntrWillReturn,
+                 /*Alignment is a constant*/ImmArg<ArgIndex<2>>,
+                                            NoCapture<ArgIndex<1>>]>;
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_load:
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 9ba75d491c1c9..c36f035e00bdc 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -169,6 +169,7 @@ void initializeLoopUnrollPass(PassRegistry&);
 void initializeLowerAtomicLegacyPassPass(PassRegistry&);
 void initializeLowerConstantIntrinsicsPass(PassRegistry&);
 void initializeLowerEmuTLSPass(PassRegistry&);
+void initializeLowerConditionalStoreIntrinsicLegacyPass(PassRegistry &);
 void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &);
 void initializeLowerIntrinsicsPass(PassRegistry&);
 void initializeLowerInvokeLegacyPassPass(PassRegistry&);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 30e7c22f31460..042a0e8768380 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -90,6 +90,7 @@ namespace {
       (void) llvm::createLoopStrengthReducePass();
       (void) llvm::createLoopUnrollPass();
       (void) llvm::createLowerConstantIntrinsicsPass();
+      (void)llvm::createLowerConditionalStoreIntrinsicPass();
       (void) llvm::createLowerGlobalDtorsLegacyPass();
       (void) llvm::createLowerInvokePass();
       (void) llvm::createLowerSwitchPass();
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index f74a49785e11b..962f5c75a01e6 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -143,6 +143,11 @@ Pass *createMergeICmpsLegacyPass();
 FunctionPass *createInferAddressSpacesPass(unsigned AddressSpace = ~0u);
 extern char &InferAddressSpacesID;
 
+//===----------------------------------------------------------------------===//
+//
+//  Lower conditional store intrinsic
+FunctionPass *createLowerConditionalStoreIntrinsicPass();
+
 //===----------------------------------------------------------------------===//
 //
 // TLSVariableHoist - This pass reduce duplicated TLS address call.
diff --git a/llvm/include/llvm/Transforms/Scalar/LowerConditionalStoreIntrinsic.h b/llvm/include/llvm/Transforms/Scalar/LowerConditionalStoreIntrinsic.h
new file mode 100644
index 0000000000000..f3b6f6ce2a185
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/LowerConditionalStoreIntrinsic.h
@@ -0,0 +1,30 @@
+//===- LowerConditionalStoreIntrinsic.h -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+///  Pass for early lowering of conditional store.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_LOWERCONDSTOREINTRINSIC_H
+#define LLVM_TRANSFORMS_SCALAR_LOWERCONDSTOREINTRINSIC_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Function;
+
+struct LowerConditionalStoreIntrinsicPass
+    : PassInfoMixin<LowerConditionalStoreIntrinsicPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &);
+};
+
+} // namespace llvm
+
+#endif
\ No newline at end of file
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index b32799355d692..4a0ba04a86e24 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -566,6 +566,22 @@ Instruction *IRBuilderBase::CreateNoAliasScopeDeclaration(Value *Scope) {
   return CreateCall(FnIntrinsic, {Scope});
 }
 
+/// Create a call to a Conditional Store intrinsic.
+/// \p Val       - data to be stored,
+/// \p Ptr       - base pointer for the store
+/// \p Alignment - alignment of the destination location
+/// \p Condition - boolean that indicates if store should be performed
+CallInst *IRBuilderBase::CreateConditionalStore(Value *Val, Value *Ptr,
+                                                Align Alignment,
+                                                Value *Condition) {
+  auto *PtrTy = cast<PointerType>(Ptr->getType());
+  Type *DataTy = Val->getType();
+  Type *OverloadedTypes[] = {DataTy, PtrTy};
+  Value *Ops[] = {Val, Ptr, getInt32(Alignment.value()), Condition};
+  return CreateMaskedIntrinsic(Intrinsic::conditional_store, Ops,
+                               OverloadedTypes);
+}
+
 /// Create a call to a Masked Load intrinsic.
 /// \p Ty        - vector type to load
 /// \p Ptr       - base pointer for the load
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 734ca4d5deec9..fe6b75c3b8edd 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -232,6 +232,7 @@
 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
 #include "llvm/Transforms/Scalar/LoopVersioningLICM.h"
 #include "llvm/Transforms/Scalar/LowerAtomicPass.h"
+#include "llvm/Transforms/Scalar/LowerConditionalStoreIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 1892e16a06528..382c57c0375e0 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -110,6 +110,7 @@
 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
 #include "llvm/Transforms/Scalar/LoopVersioningLICM.h"
+#include "llvm/Transforms/Scalar/LowerConditionalStoreIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
@@ -499,7 +500,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
   FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
                                               /*UseMemorySSA=*/false,
                                               /*UseBlockFrequencyInfo=*/false));
-
+  FPM.addPass(LowerConditionalStoreIntrinsicPass());
   // Delete small array after loop unroll.
   FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
 
@@ -691,7 +692,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
                                               /*UseMemorySSA=*/false,
                                               /*UseBlockFrequencyInfo=*/false));
-
+  FPM.addPass(LowerConditionalStoreIntrinsicPass());
   // Delete small array after loop unroll.
   FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
 
@@ -744,7 +745,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
       LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
                /*AllowSpeculation=*/true),
       /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
-
+  FPM.addPass(LowerConditionalStoreIntrinsicPass());
   FPM.addPass(CoroElidePass());
 
   invokeScalarOptimizerLateEPCallbacks(FPM, Level);
@@ -1279,6 +1280,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
         SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
     ExtraPasses.addPass(InstCombinePass());
     FPM.addPass(std::move(ExtraPasses));
+    FPM.addPass(LowerConditionalStoreIntrinsicPass());
   }
 
   // Now that we've formed fast to execute loop structures, we do further
@@ -1354,6 +1356,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
       LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
                /*AllowSpeculation=*/true),
       /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
+  FPM.addPass(LowerConditionalStoreIntrinsicPass());
 
   // Now that we've vectorized and unrolled loops, we may have more refined
   // alignment information, try to re-derive it here.
@@ -1950,6 +1953,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
       LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
                /*AllowSpeculation=*/true),
       /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
+  MainFPM.addPass(LowerConditionalStoreIntrinsicPass());
 
   if (RunNewGVN)
     MainFPM.addPass(NewGVNPass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 50682ca4970f1..04d01f7d7310e 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -385,6 +385,7 @@ FUNCTION_PASS("lower-allow-check", LowerAllowCheckPass())
 FUNCTION_PASS("lower-atomic", LowerAtomicPass())
 FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass())
 FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass())
+FUNCTION_PASS("lower-conditional-store", LowerConditionalStoreIntrinsicPass())
 FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass())
 FUNCTION_PASS("lower-invoke", LowerInvokePass())
 FUNCTION_PASS("lower-switch", LowerSwitchPass())
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 945ab5cf1f303..db8eb7a952335 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -619,6 +619,9 @@ void AArch64PassConfig::addIRPasses() {
     // Do loop invariant code motion in case part of the lowered result is
     // invariant.
     addPass(createLICMPass());
+    // This pass expands conditional store intrinsics,
+    //  which are not supported in the target
+    addPass(createLowerConditionalStoreIntrinsicPass());
   }
 
   TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dbbfe34a63863..a865c81d430e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1046,8 +1046,12 @@ void AMDGPUPassConfig::addIRPasses() {
 
     // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
     // have expanded.
-    if (TM.getOptLevel() > CodeGenOptLevel::Less)
+    if (TM.getOptLevel() > CodeGenOptLevel::Less) {
       addPass(createLICMPass());
+      // This pass expands conditional store intrinsics,
+      //  which are not supported in the target
+      addPass(createLowerConditionalStoreIntrinsicPass());
+    }
   }
 
   TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 714cf69827a1e..3650f4c2121e6 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -485,6 +485,9 @@ void PPCPassConfig::addIRPasses() {
     // Do loop invariant code motion in case part of the lowered result is
     // invariant.
     addPass(createLICMPass());
+    // This pass expands conditional store intrinsics,
+    //  which are not supported in the target
+    addPass(createLowerConditionalStoreIntrinsicPass());
   }
 
   TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index ba09ebf8b04c4..bdecc839729c9 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -48,6 +48,7 @@ add_llvm_component_library(LLVMScalarOpts
   LoopUnrollAndJamPass.cpp
   LoopVersioningLICM.cpp
   LowerAtomicPass.cpp
+  LowerConditionalStoreIntrinsic.cpp
   LowerConstantIntrinsics.cpp
   LowerExpectIntrinsic.cpp
   LowerGuardIntrinsic.cpp
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 5eccf7b4adb65..009b5f3bb7bcc 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -157,6 +157,11 @@ cl::opt<unsigned> llvm::SetLicmMssaOptCap(
     cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
              "for faster compile. Caps the MemorySSA clobbering calls."));
 
+cl::opt<bool> SetLicmConditionalAccessPromotion(
+    "licm-conditional-access-promotion", cl::Hidden, cl::init(true),
+    cl::desc("Enable promotion of conditional accesses of loop-invariant"
+             " locations"));
+
 // Experimentally, memory promotion carries less importance than sinking and
 // hoisting. Limit when we do promotion when using MemorySSA, in order to save
 // compile time.
@@ -1819,6 +1824,12 @@ class LoopPromoter : public LoadAndStorePromoter {
   AAMDNodes AATags;
   ICFLoopSafetyInfo &SafetyInfo;
   bool CanInsertStoresInExitBlocks;
+  // This flag will be used to make sure that every sunk conditional store
+  // instruction is executed conditionally within the exit blocks. In the
+  // preheader, it is initialized to 0. In every basic block containing a
+  // conditional store it is raised.
+  bool ConditionalAccessShouldBePromoted;
+  SSAUpdater &FlagSSAUpdater;
   ArrayRef<const Instruction *> Uses;
 
   // We're about to add a use of V in a loop exit block.  Insert an LCSSA phi
@@ -1839,6 +1850,17 @@ class LoopPromoter : public LoadAndStorePromoter {
     return PN;
   }
 
+  void promoteConditionalAccess(BasicBlock *ExitBlock, Value *LiveInValue,
+                                Value *PtrToExitBB,
+                                BasicBlock::iterator InsertPos) {
+    Value *FlagValue = FlagSSAUpdater.GetValueInMiddleOfBlock(ExitBlock);
+    IRBuilder<> Builder(&*InsertPos);
+    Type *DataType = LiveInValue->getType();
+    Value *Ptr = Builder.CreatePointerCast(PtrToExitBB,
+                                           PointerType::getUnqual(DataType));
+    Builder.CreateConditionalStore(LiveInValue, Ptr, Alignment, FlagValue);
+  }
+
 public:
   LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S,
                SmallVectorImpl<BasicBlock *> &LEB,
@@ -1846,13 +1868,17 @@ class LoopPromoter : public LoadAndStorePromoter {
                SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC,
                MemorySSAUpdater &MSSAU, LoopInfo &li, DebugLoc dl,
                Align Alignment, bool UnorderedAtomic, const AAMDNodes &AATags,
-               ICFLoopSafetyInfo &SafetyInfo, bool CanInsertStoresInExitBlocks)
+               ICFLoopSafetyInfo &SafetyInfo, bool CanInsertStoresInExitBlocks,
+               bool ConditionalAccessShouldBePromoted,
+               SSAUpdater &FlagSSAUpdater)
       : LoadAndStorePromoter(Insts, S), SomePtr(SP), LoopExitBlocks(LEB),
         LoopInsertPts(LIP), MSSAInsertPts(MSSAIP), PredCache(PIC), MSSAU(MSSAU),
         LI(li), DL(std::move(dl)), Alignment(Alignment),
         UnorderedAtomic(UnorderedAtomic), AATags(AATags),
         SafetyInfo(SafetyInfo),
-        CanInsertStoresInExitBlocks(CanInsertStoresInExitBlocks), Uses(Insts) {}
+        CanInsertStoresInExitBlocks(CanInsertStoresInExitBlocks),
+        ConditionalAccessShouldBePromoted(ConditionalAccessShouldBePromoted),
+        FlagSSAUpdater(FlagSSAUpdater), Uses(Insts) {}
 
   void insertStoresInLoopExitBlocks() {
     // Insert stores after in the loop exit blocks.  Each exit block gets a
@@ -1866,6 +1892,10 @@ class LoopPromoter : public LoadAndStorePromoter {
       LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
       Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
       BasicBlock::iterator InsertPos = LoopInsertPts[i];
+      if (ConditionalAccessShouldBePromoted) {
+        promoteConditionalAccess(ExitBlock, LiveInValue, Ptr, InsertPos);
+        continue;
+      }
       StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
       if (UnorderedAtomic)
         NewSI->setOrdering(AtomicOrdering::Unordered);
@@ -2042,6 +2072,9 @@ bool llvm::promoteLoopAccessesToScalars(
   bool SawNotAtomic = false;
   AAMDNodes AATags;
 
+  bool SawConditionalLIStore = false;
+  StringRef PointerOperandName;
+
   const DataLayout &MDL = Preheader->getModule()->getDataLayout();
 
   // If there are reads outside the promoted set, then promoting stores is
@@ -2119,6 +2152,12 @@ bool llvm::promoteLoopAccessesToScalars(
           if (StoreSafety == StoreSafetyUnknown)
             StoreSafety = StoreSafe;
           Alignment = std::max(Alignment, InstAlignment);
+        } else if (SetLicmConditionalAccessPromotion &&
+                   (!SawConditionalLIStore || (InstAlignment > Alignment))) {
+          SawConditionalLIStore = true;
+          if (PointerOperandName.empty())
+            PointerOperandName = Store->getPointerOperand()->getName();
+          Alignment = std::max(Alignment, InstAlignment);
         }
 
         // If a store dominates all exit blocks, it is safe to sink.
@@ -2199,6 +2238,29 @@ bool llvm::promoteLoopAccessesToScalars(
     // If we cannot hoist the load either, give up.
     return false;
 
+  const bool PromoteConditionalAccesses =
+      SetLicmConditionalAccessPromotion && SawConditionalLIStore;
+  bool ConditionalAccessShouldBePromoted = false;
+  SmallVector<PHINode *, 16> FlagPHIs;
+  SSAUpdater FlagSSAUpdater(&FlagPHIs);
+  if (StoreSafety == StoreSafetyUnknown && PromoteConditionalAccesses) {
+    ConditionalAccessShouldBePromoted = true;
+    // If we are allowed to promote conditional stores, store promotion is safe
+    StoreSafety = StoreSafe;
+    Type *Int1Ty = Type::getInt1Ty(Preheader->getParent()->getContext());
+    FlagSSAUpdater.Initialize(Int1Ty, PointerOperandName.str() + ".flag");
+    // Initialize the flag with 0 in the preheader.
+    FlagSSAUpdater.AddAvailableValue(Preheader,
+                                     ConstantInt::get(Int1Ty,
+                                                      /* Value */ 0));
+    for (auto *UI : LoopUses)
+      if (StoreInst *ConditionalLIStore = dyn_cast<StoreInst>(UI))
+        // Raise the flag if a conditional store happened.
+        FlagSSAUpdater.AddAvailableValue(ConditionalLIStore->getParent(),
+                                         ConstantInt::get(Int1Ty,
+                                                          /* Value */ 1));
+  }
+
   // Lets do the promotion!
   if (StoreSafety == StoreSafe) {
     LLVM_DEBUG(dbgs() << "LICM: Promoting load/store of the value: " << *SomePtr
@@ -2228,7 +2290,8 @@ bool llvm::promoteLoopAccessesToScalars(
   LoopPromoter Promoter(SomePtr, LoopUses, SSA, ExitBlocks, InsertPts,
                         MSSAInsertPts, PIC, MSSAU, *LI, DL, Alignment,
                         SawUnorderedAtomic, AATags, *SafetyInfo,
-                        StoreSafety == StoreSafe);
+                        StoreSafety == StoreSafe,
+                        ConditionalAccessShouldBePromoted, FlagSSAUpdater);
 
   // Set up the preheader to have a definition of the value.  It is the live-out
   // value from the preheader that uses in the loop will use.
diff --git a/llvm/lib/Transforms/Scalar/LowerConditionalStoreIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerConditionalStoreIntrinsic.cpp
new file mode 100644
index 0000000000000..1a69e053e11d2
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/LowerConditionalStoreIntrinsic.cpp
@@ -0,0 +1,115 @@
+//===- LowerConditionalStoreIntrinsic.cpp -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerConditionalStoreIntrinsic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-cond-store-intrinsic"
+
+// Conditional store intrinsic removal:
+// block:
+//  llvm.conditional.store.*(Val, Ptr, Condition)
+//                         |
+//                         V
+// block:
+//  br %i1 Condition, label cond.store, label block.remaining
+// cond.tore:
+//  store * Val, Ptr
+//  br label block.remaining
+// block.remaining:
+
+static bool isCondStoreIntr(Instruction &Instr) {
+  CallInst *CI = dyn_cast<CallInst>(&Instr);
+  if (!CI)
+    return false;
+
+  Function *Fn = CI->getCalledFunction();
+  if (Fn && Fn->getIntrinsicID() == Intrinsic::conditional_store)
+    return true;
+  return false;
+}
+
+static void lowerCondStoreIntr(Instruction &Instr, BasicBlock &BB) {
+  LLVM_DEBUG(dbgs() << "Found basic with conditional store: " << BB.getName()
+                    << "\n");
+  auto *Val = Instr.getOperand(0);
+  auto *Ptr = Instr.getOperand(1);
+  auto *AlignmentVal = dyn_cast<ConstantInt>(Instr.getOperand(2));
+  auto Alignment = MaybeAlign(AlignmentVal->getValue().getLimitedValue());
+  assert(AlignmentVal && "Invalid intrinsic operands");
+  auto *Cond = Instr.getOperand(3);
+
+  Instruction *ThenBlock =
+      SplitBlockAndInsertIfThen(Cond, &Instr, /*Unreachable*/ false);
+
+  IRBuilder<> IB(ThenBlock);
+  IB.CreateAlignedStore(Val, Ptr, Alignment);
+
+  Instr.eraseFromParent();
+}
+
+static bool lowerCondStoreIntrinsicForFunc(Function &F) {
+  bool Changed = false;
+
+  for (BasicBlock &BB : F)
+    for (Instruction &Instr : llvm::make_early_inc_range(llvm::reverse(BB)))
+      if (isCondStoreIntr(Instr)) {
+        lowerCondStoreIntr(Instr, BB);
+        Changed = true;
+      }
+  return Changed;
+}
+
+PreservedAnalyses
+LowerConditionalStoreIntrinsicPass::run(Function &F,
+                                        FunctionAnalysisManager &) {
+  if (lowerCondStoreIntrinsicForFunc(F))
+    return PreservedAnalyses::none();
+
+  return PreservedAnalyses::all();
+}
+
+namespace {
+
+class LowerConditionalStoreIntrinsicLegacy : public FunctionPass {
+  LowerConditionalStoreIntrinsicPass Impl;
+
+public:
+  static char ID;
+  LowerConditionalStoreIntrinsicLegacy() : FunctionPass(ID) {
+    initializeLowerConditionalStoreIntrinsicLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    // Don't skip optnone functions; atomics still need to be lowered.
+    FunctionAnalysisManager DummyFAM;
+    auto PA = Impl.run(F, DummyFAM);
+    return !PA.areAllPreserved();
+  }
+};
+} // namespace
+
+char LowerConditionalStoreIntrinsicLegacy::ID = 0;
+INITIALIZE_PASS(LowerConditionalStoreIntrinsicLegacy, "lower-conditional-store",
+                "Lower conditional store", false, false)
+
+FunctionPass *llvm::createLowerConditionalStoreIntrinsicPass() {
+  return new LowerConditionalStoreIntrinsicLegacy();
+}
\ No newline at end of file
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index 400b15284c1b8..420f45bf4a570 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -33,6 +33,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLoopUnrollPass(Registry);
   initializeLowerAtomicLegacyPassPass(Registry);
   initializeLowerConstantIntrinsicsPass(Registry);
+  initializeLowerConditionalStoreIntrinsicLegacyPass(Registry);
   initializeMergeICmpsLegacyPassPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
diff --git a/llvm/test/Transforms/LICM/conditional-store-intrinsic.ll b/llvm/test/Transforms/LICM/conditional-store-intrinsic.ll
new file mode 100644
index 0000000000000..7fe111eeb641d
--- /dev/null
+++ b/llvm/test/Transforms/LICM/conditional-store-intrinsic.ll
@@ -0,0 +1,22 @@
+; RUN: opt -S -passes=lower-conditional-store < %s | FileCheck %s
+define void @foo(ptr %p, i64 %val, i1 %cond) {
+  call void @llvm.conditional.store.i64.p0(i64 %val, ptr %p, i32 4, i1 %cond)
+  call void @llvm.conditional.store.i64.p0(i64 %val, ptr %p, i32 4, i1 %cond)
+  ret void
+} 
+
+declare void @llvm.conditional.store.i64.p0(i64, ptr nocapture, i32 immarg, i1)
+
+; CHECK: define void @foo(ptr %p, i64 %val, i1 %cond) {
+; CHECK:   br i1 %cond, label %1, label %2
+; CHECK: 1: 
+; CHECK:   store i64 %val, ptr %p, align 4
+; CHECK:   br label %2
+; CHECK: 2:
+; CHECK:   br i1 %cond, label %3, label %4
+; CHECK: 3:
+; CHECK:   store i64 %val, ptr %p, align 4
+; CHECK:   br label %4
+; CHECK: 4:
+; CHECK:   ret void
+; CHECK: }
\ No newline at end of file
diff --git a/llvm/test/Transforms/LICM/conditional-store-promotion-possibility.ll b/llvm/test/Transforms/LICM/conditional-store-promotion-possibility.ll
index b694ea102ff09..809565ab7fe2b 100644
--- a/llvm/test/Transforms/LICM/conditional-store-promotion-possibility.ll
+++ b/llvm/test/Transforms/LICM/conditional-store-promotion-possibility.ll
@@ -1,4 +1,5 @@
-; RUN: opt -S -passes=licm < %s | FileCheck %s
+; RUN: opt -S -passes=licm -licm-conditional-access-promotion=true < %s > %t
+; RUN: opt -S -passes=lower-conditional-store < %t | FileCheck %s
 @res = dso_local local_unnamed_addr global i32 0, align 4
 
 define dso_local void @test(ptr noalias nocapture noundef readonly %a, i32 noundef signext %N) local_unnamed_addr #0 {
@@ -39,6 +40,7 @@ define dso_local void @test(ptr noalias nocapture noundef readonly %a, i32 nound
 ; CHECK:    br label %for.cond
 
 ; CHECK:  for.cond:
+; CHECK:    %res.flag4 = phi i1 [ false, %entry ], [ %res.flag, %for.inc ]
 ; CHECK:    %inc3 = phi i32 [ %res.promoted, %entry ], [ %inc2, %for.inc ]
 ; CHECK:    %i.0 = phi i32 [ 0, %entry ], [ %inc1, %for.inc ]
 ; CHECK:    %cmp = icmp slt i32 %i.0, %N
@@ -53,14 +55,22 @@ define dso_local void @test(ptr noalias nocapture noundef readonly %a, i32 nound
 
 ; CHECK:  if.then:
 ; CHECK:    %inc = add nsw i32 %inc3, 1
-; CHECK:    store i32 %inc, ptr @res, align 4
 ; CHECK:    br label %for.inc
 
 ; CHECK:  for.inc:
+; CHECK:    %res.flag = phi i1 [ true, %if.then ], [ %res.flag4, %for.body ]
 ; CHECK:    %inc2 = phi i32 [ %inc, %if.then ], [ %inc3, %for.body ]
 ; CHECK:    %inc1 = add nuw nsw i32 %i.0, 1
 ; CHECK:    br label %for.cond
 
 ; CHECK:  for.cond.cleanup:
-  ; CHECK:    ret void
-; CHECK:  }
\ No newline at end of file
+; CHECK:    %res.flag4.lcssa = phi i1 [ %res.flag4, %for.cond ]
+; CHECK:    %inc3.lcssa = phi i32 [ %inc3, %for.cond ]
+; CHECK:    br i1 %res.flag4.lcssa, label %1, label %2
+
+; CHECK:  1:
+; CHECK:    store i32 %inc3.lcssa, ptr @res, align 4
+; CHECK:    br label %2
+
+; CHECK:  2:
+; CHECK:    ret void
\ No newline at end of file
diff --git a/llvm/test/Transforms/LICM/promote-conditional-store-intr.ll b/llvm/test/Transforms/LICM/promote-conditional-store-intr.ll
new file mode 100644
index 0000000000000..f805b6b674e2e
--- /dev/null
+++ b/llvm/test/Transforms/LICM/promote-conditional-store-intr.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -passes=licm -licm-conditional-access-promotion=true < %s | FileCheck %s
+ at res = dso_local local_unnamed_addr global i32 0, align 4
+
+define dso_local void @test(ptr noalias nocapture noundef readonly %a, i32 noundef signext %N) local_unnamed_addr #0 {
+  ; Preheader:
+  entry:
+    br label %for.cond
+
+  ; Loop:
+  for.cond:                                         ; preds = %for.inc, %entry
+    %i.0 = phi i32 [ 0, %entry ], [ %inc1, %for.inc ]
+    %cmp = icmp slt i32 %i.0, %N
+    br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+  for.body:                                         ; preds = %for.cond
+    %idxprom = zext i32 %i.0 to i64
+    %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+    %0 = load i32, ptr %arrayidx, align 4
+    %tobool.not = icmp eq i32 %0, 0
+    br i1 %tobool.not, label %for.inc, label %if.then
+
+  if.then:                                          ; preds = %for.body
+    %1 = load i32, ptr @res, align 4
+    %inc = add nsw i32 %1, 1
+    store i32 %inc, ptr @res, align 4
+    br label %for.inc
+
+  for.inc:                                          ; preds = %for.body, %if.then
+    %inc1 = add nuw nsw i32 %i.0, 1
+    br label %for.cond 
+
+  ; Exit blocks
+  for.cond.cleanup:                                 ; preds = %for.cond
+    ret void
+}
+
+; CHECK: entry:
+; CHECK:   %res.promoted = load i32, ptr @res, align 4
+; CHECK:   br label %for.cond
+ 
+; CHECK: for.cond:
+; CHECK:   %res.flag4 = phi i1 [ false, %entry ], [ %res.flag, %for.inc ]
+; CHECK:   %inc3 = phi i32 [ %res.promoted, %entry ], [ %inc2, %for.inc ]
+; CHECK:   %i.0 = phi i32 [ 0, %entry ], [ %inc1, %for.inc ]
+; CHECK:   %cmp = icmp slt i32 %i.0, %N
+; CHECK:   br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK: for.body:
+; CHECK:   %idxprom = zext i32 %i.0 to i64
+; CHECK:   %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+; CHECK:   %0 = load i32, ptr %arrayidx, align 4
+; CHECK:   %tobool.not = icmp eq i32 %0, 0
+; CHECK:   br i1 %tobool.not, label %for.inc, label %if.then
+
+; CHECK: if.then:
+; CHECK:   %inc = add nsw i32 %inc3, 1
+; CHECK:   br label %for.inc
+
+; CHECK: for.inc:
+; CHECK:   %res.flag = phi i1 [ true, %if.then ], [ %res.flag4, %for.body ]
+; CHECK:   %inc2 = phi i32 [ %inc, %if.then ], [ %inc3, %for.body ]
+; CHECK:   %inc1 = add nuw nsw i32 %i.0, 1
+; CHECK:   br label %for.cond
+
+; CHECK: for.cond.cleanup:
+; CHECK:   %res.flag4.lcssa = phi i1 [ %res.flag4, %for.cond ]
+; CHECK:   %inc3.lcssa = phi i32 [ %inc3, %for.cond ]
+; CHECK:   call void @llvm.conditional.store.i32.p0(i32 %inc3.lcssa, ptr @res, i32 4, i1 %res.flag4.lcssa)
+; CHECK:   ret void
\ No newline at end of file



More information about the llvm-commits mailing list