[llvm] [X86] Support hoisting load/store with conditional faulting (PR #95515)

Shengchen Kan via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 14 02:17:19 PDT 2024


https://github.com/KanRobert updated https://github.com/llvm/llvm-project/pull/95515

>From e598cb1cf295e3ef843c94a4f4d15e073d51478a Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Wed, 5 Jun 2024 15:04:27 +0800
Subject: [PATCH] [X86] Support hoisting load/store with conditional faulting

1. Add TTI interface for conditional load/store
2. Hoist load/store from successors if the targets support conditional
   faulting
3. Lower masked load/store to CFCMOV
---
 .../llvm/Analysis/TargetTransformInfo.h       |  8 ++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  1 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  5 +
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 18 ++++
 llvm/lib/Target/X86/X86TargetTransformInfo.h  |  1 +
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 92 ++++++++++++++++++-
 llvm/test/CodeGen/X86/apx/cf.ll               | 51 ++++++++++
 .../X86/hoist-load-store-with-cf.ll           | 34 +++++++
 8 files changed, 207 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/apx/cf.ll
 create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a..37afda39a1c9c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1113,6 +1113,10 @@ class TargetTransformInfo {
   /// \return the number of registers in the target-provided register class.
   unsigned getNumberOfRegisters(unsigned ClassID) const;
 
+  /// \return true if the target supports load/store that enables fault
+  /// suppression of memory operands when the source condition is false.
+  bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
+
   /// \return the target-provided register class ID for the provided type,
   /// accounting for type promotion and other type-legalization techniques that
   /// the target might apply. However, it specifically does not account for the
@@ -1956,6 +1960,7 @@ class TargetTransformInfo::Concept {
   virtual bool preferToKeepConstantsAttached(const Instruction &Inst,
                                              const Function &Fn) const = 0;
   virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
+  virtual bool hasConditionalFaultingLoadStoreForType(Type *Ty) const = 0;
   virtual unsigned getRegisterClassForType(bool Vector,
                                            Type *Ty = nullptr) const = 0;
   virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
@@ -2543,6 +2548,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getNumberOfRegisters(unsigned ClassID) const override {
     return Impl.getNumberOfRegisters(ClassID);
   }
+  bool hasConditionalFaultingLoadStoreForType(Type *Ty) const override {
+    return Impl.hasConditionalFaultingLoadStoreForType(Ty);
+  }
   unsigned getRegisterClassForType(bool Vector,
                                    Type *Ty = nullptr) const override {
     return Impl.getRegisterClassForType(Vector, Ty);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43..a4aa836ed82d3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -457,6 +457,7 @@ class TargetTransformInfoImplBase {
   }
 
   unsigned getNumberOfRegisters(unsigned ClassID) const { return 8; }
+  bool hasConditionalFaultingLoadStoreForType(Type *Ty) const { return false; }
 
   unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const {
     return Vector ? 1 : 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f..46936f266bf46 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -722,6 +722,11 @@ unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const {
   return TTIImpl->getNumberOfRegisters(ClassID);
 }
 
+bool TargetTransformInfo::hasConditionalFaultingLoadStoreForType(
+    Type *Ty) const {
+  return TTIImpl->hasConditionalFaultingLoadStoreForType(Ty);
+}
+
 unsigned TargetTransformInfo::getRegisterClassForType(bool Vector,
                                                       Type *Ty) const {
   return TTIImpl->getRegisterClassForType(Vector, Ty);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 74948778ccf85..25597f836eb4d 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -176,6 +176,24 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
   return 8;
 }
 
+bool X86TTIImpl::hasConditionalFaultingLoadStoreForType(Type *Ty) const {
+  // Conditional faulting is supported by CFCMOV, which only accepts
+  // 8/16/32/64-bit operands.
+  // NOTE: Though VMOVSS/VMOVSD suppresses memory fault with zero mask, it has
+  // performance penalty.
+  if (!ST->hasCF() || !Ty || !Ty->isIntegerTy())
+    return false;
+  switch (cast<IntegerType>(Ty)->getBitWidth()) {
+  default:
+    return false;
+  case 8:
+  case 16:
+  case 32:
+  case 64:
+    return true;
+  }
+}
+
 TypeSize
 X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
   unsigned PreferVectorWidth = ST->getPreferVectorWidth();
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index e14dc9fc09051..701648c6a2b3a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,6 +132,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   /// @{
 
   unsigned getNumberOfRegisters(unsigned ClassID) const;
+  bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(ElementCount VF);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 107c8bb6c027f..7be6ba1ecda7c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -131,6 +131,12 @@ static cl::opt<bool> HoistCondStores(
     "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
     cl::desc("Hoist conditional stores if an unconditional store precedes"));
 
+static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
+    "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
+    cl::init(true),
+    cl::desc("Hoist loads/stores if the target supports "
+             "conditional faulting"));
+
 static cl::opt<bool> MergeCondStores(
     "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
     cl::desc("Hoist conditional stores even if an unconditional store does not "
@@ -275,6 +281,7 @@ class SimplifyCFGOpt {
   bool hoistSuccIdenticalTerminatorToSwitchOrIf(
       Instruction *TI, Instruction *I1,
       SmallVectorImpl<Instruction *> &OtherSuccTIs);
+  bool hoistLoadStoreWithCondFaultingFromSuccessors(BasicBlock *BB);
   bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
   bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
                                   BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2977,6 +2984,79 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
   return HaveRewritablePHIs;
 }
 
+/// Hoist load/store instructions from the conditional successor blocks up into
+/// the block.
+///
+/// We are looking for code like the following:
+/// \code
+///   BB:
+///     ...
+///     %cond = icmp ult %x, %y
+///     br i1 %cond, label %TrueBB, label %FalseBB
+///   FalseBB:
+///     store i32 1, ptr %q, align 4
+///     ...
+///   TrueBB:
+///     %0 = load i32, ptr %b, align 4
+///     store i32 %0, ptr %p, align 4
+///     ...
+/// \endcode
+//
+/// We are going to transform this into:
+///
+/// \code
+///   BB:
+///     ...
+///     %cond = icmp ult %x, %y
+///     %0 = cload i32, ptr %b, %cond
+///     cstore i32 %0, ptr %p, %cond
+///     cstore i32 1, ptr %q, ~%cond
+///     br i1 %cond, label %TrueBB, label %FalseBB
+///   FalseBB:
+///     ...
+///   TrueBB:
+///     ...
+/// \endcode
+///
+/// where cload/cstore is represented by llvm.masked.load/store, e.g.
+///
+/// \code
+///   %vcond = insertelement <1 x i1> undef, i1 %cond, i32 0
+///   %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+///                         (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> undef)
+///   %0 = bitcast <1 x i32> %v0 to i32 call void
+///   @llvm.masked.store.v1i32.p0(<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+///   %cond.not = xor i1 %cond, true
+///   %vcond.not = insertelement <1 x i1> undef, i1 %cond, i32 0
+///   call void @llvm.masked.store.v1i32.p0
+///              (<1 x i32> <i32 1>, ptr %q, i32 4, <1x i1> %vcond.not)
+/// \endcode
+///
+/// \returns true if any load/store is hosited.
+///
+/// Note that this tranform should be run
+/// * before SpeculativelyExecuteBB so that the latter can have more chance.
+/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
+///   are handled first.
+bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
+    BasicBlock *BB) {
+  auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+
+  BasicBlock *IfTrueBB = BI->getSuccessor(0);
+  BasicBlock *IfFalseBB = BI->getSuccessor(1);
+
+  // If either of the blocks has it's address taken, then we can't do this fold,
+  // because the code we'd hoist would no longer run when we jump into the block
+  // by it's address.
+  for (auto *Succ : {IfTrueBB, IfFalseBB})
+    if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+      return false;
+
+  return false;
+}
+
 /// Speculate a conditional basic block flattening the CFG.
 ///
 /// Note that this is a very risky transform currently. Speculating
@@ -7436,14 +7516,20 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     return requestResimplify();
 
   // We have a conditional branch to two blocks that are only reachable
-  // from BI.  We know that the condbr dominates the two blocks, so see if
-  // there is any identical code in the "then" and "else" blocks.  If so, we
-  // can hoist it up to the branching block.
+  // from BI.  We know that the condbr dominates the two blocks, so see
+  //
+  // * if there is any identical code in the "then" and "else" blocks.
+  // * if there is any different load/store in the "then" and "else" blocks.
+  //
+  // If so, we can hoist it up to the branching block.
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
     if (BI->getSuccessor(1)->getSinglePredecessor()) {
       if (HoistCommon && hoistCommonCodeFromSuccessors(
                              BI->getParent(), !Options.HoistCommonInsts))
         return requestResimplify();
+      if (HoistLoadsStoresWithCondFaulting &&
+          hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+        return requestResimplify();
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll
new file mode 100644
index 0000000000000..be89625d2c631
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/cf.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+cf -verify-machineinstrs | FileCheck %s
+
+define void @test (i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    negb %al
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    # implicit-def: $r8d
+; CHECK-NEXT:    je .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %else
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB0_3
+; CHECK-NEXT:  .LBB0_4: # %else1
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB0_5
+; CHECK-NEXT:  .LBB0_6: # %else3
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_1: # %cond.load
+; CHECK-NEXT:    movl (%rsi), %r8d
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB0_4
+; CHECK-NEXT:  .LBB0_3: # %cond.store
+; CHECK-NEXT:    movl %r8d, (%rdx)
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    jne .LBB0_6
+; CHECK-NEXT:  .LBB0_5: # %cond.store2
+; CHECK-NEXT:    movl $1, (%rcx)
+; CHECK-NEXT:    retq
+entry:
+  %cond = icmp eq i32 %a, 0
+  %vcond = insertelement <1 x i1> undef, i1 %cond, i32 0
+  %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0 (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> undef)
+  %0 = bitcast <1 x i32> %v0 to i32
+  call void @llvm.masked.store.v1i32.p0(<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+  %cond.not = xor i1 %cond, true
+  %vcond.not = insertelement <1 x i1> undef, i1 %cond, i32 0
+  call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %vcond.not)
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:                                          ; preds = %entry
+  br label %if.end
+
+if.true:                                          ; preds = %entry
+  br label %if.end
+
+if.end:                                           ; preds = %if.true, %if.false
+  ret void
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
new file mode 100644
index 0000000000000..302ea7a4ccd6b
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+
+define void @test(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:                                          ; preds = %entry
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:                                          ; preds = %entry
+  %0 = load i32, ptr %b, align 4
+  store i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.true, %if.false
+  ret void
+}



More information about the llvm-commits mailing list