[llvm] [X86][APX] Convert store(cmov(load(x), y), x) to cstore(y, x) (PR #118946)

Fri Dec 6 18:53:11 PST 2024

https://github.com/phoebewang updated https://github.com/llvm/llvm-project/pull/118946

>From e24f31eaa72a46da93bc0718ced2afb9180fa36f Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Fri, 6 Dec 2024 17:39:14 +0800
Subject: [PATCH 1/4] [X86][APX] Convert store(cmov(load(x), y), x) to
 cstore(y, x)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 54 +++++++++++++++++++++++++
 llvm/test/CodeGen/X86/apx/cfcmov.ll     | 41 +++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2d3bb2ece621e0..7c3d0ded4fce7f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2927,6 +2927,24 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
   }
 }
 
+static X86::CondCode getInvertedX86CC(X86::CondCode CC) {
+  switch (CC) {
+    // clang-format off
+  default: llvm_unreachable("Invalid integer condition!");
+  case X86::COND_E  : return X86::COND_NE;
+  case X86::COND_G  : return X86::COND_LE;
+  case X86::COND_GE : return X86::COND_L;
+  case X86::COND_L  : return X86::COND_GE;
+  case X86::COND_LE : return X86::COND_G;
+  case X86::COND_NE : return X86::COND_E;
+  case X86::COND_B  : return X86::COND_AE;
+  case X86::COND_A  : return X86::COND_BE;
+  case X86::COND_BE : return X86::COND_A;
+  case X86::COND_AE : return X86::COND_B;
+    // clang-format on
+  }
+}
+
 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
 /// condition code, returning the condition code and the LHS/RHS of the
 /// comparison to make.
@@ -52786,6 +52804,42 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Convert store(cmov(load(x), y), x) to cstore(y, x).
+  if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+      Subtarget.hasCF() && St->isSimple()) {
+    SDValue Cmov;
+    if (StoredVal.getOpcode() == X86ISD::CMOV)
+      Cmov = StoredVal;
+    else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
+             StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
+      Cmov = StoredVal.getOperand(0);
+    else
+      return SDValue();
+
+    auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
+    if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
+      return SDValue();
+
+    bool InvertCC = false;
+    SDValue V = SDValue(Ld, 0);
+    if (V == Cmov.getOperand(1))
+      InvertCC = true;
+    else if (V != Cmov.getOperand(0))
+      return SDValue();
+
+    SDVTList Tys = DAG.getVTList(MVT::Other);
+    SDValue CC = Cmov.getOperand(2);
+    SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
+    if (InvertCC)
+      CC = DAG.getTargetConstant(
+          getInvertedX86CC((X86::CondCode)Cmov.getConstantOperandVal(2)), dl,
+          MVT::i8);
+    SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
+                     Cmov.getOperand(3)};
+    return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
+                                   St->getMemOperand());
+  }
+
   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
diff --git a/llvm/test/CodeGen/X86/apx/cfcmov.ll b/llvm/test/CodeGen/X86/apx/cfcmov.ll
index f643120c9b50ff..22cd9534c9aafd 100644
--- a/llvm/test/CodeGen/X86/apx/cfcmov.ll
+++ b/llvm/test/CodeGen/X86/apx/cfcmov.ll
@@ -93,3 +93,44 @@ define i64 @cfcmov64rr_inv(i64 %0) {
   %3 = select i1 %2, i64 0, i64 %0
   ret i64 %3
 }
+
+define void @cfcmov16mr(ptr %p, i16 %0) {
+; CHECK-LABEL: cfcmov16mr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    cmpw %ax, %si
+; CHECK-NEXT:    cfcmovlew %si, (%rdi)
+; CHECK-NEXT:    retq
+  %2 = load i16, ptr %p, align 2
+  %3 = icmp sgt i16 %0, %2
+  %4 = select i1 %3, i16 %2, i16 %0
+  store i16 %4, ptr %p, align 2
+  ret void
+}
+
+define void @cfcmov32mr(ptr %p, i32 %0) {
+; CHECK-LABEL: cfcmov32mr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpl (%rdi), %esi
+; CHECK-NEXT:    cfcmovgl %esi, (%rdi)
+; CHECK-NEXT:    retq
+  %2 = load i32, ptr %p, align 4
+  %3 = call i32 @llvm.smax.i32(i32 %0, i32 %2)
+  store i32 %3, ptr %p, align 4
+  ret void
+}
+
+define void @cfcmov64mr(ptr %p, i64 %0) {
+; CHECK-LABEL: cfcmov64mr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpq (%rdi), %rsi
+; CHECK-NEXT:    cfcmovgq %rsi, (%rdi)
+; CHECK-NEXT:    retq
+  %2 = load i64, ptr %p, align 2
+  %3 = icmp sgt i64 %0, %2
+  %4 = select i1 %3, i64 %0, i64 %2
+  store i64 %4, ptr %p, align 2
+  ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)

>From 405c4f02efa9afb2ab3e68a4d70c0e8cbfcc33b8 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Fri, 6 Dec 2024 20:37:01 +0800
Subject: [PATCH 2/4] Use GetOppositeBranchCondition

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7c3d0ded4fce7f..013395e93b5767 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2927,24 +2927,6 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
   }
 }
 
-static X86::CondCode getInvertedX86CC(X86::CondCode CC) {
-  switch (CC) {
-    // clang-format off
-  default: llvm_unreachable("Invalid integer condition!");
-  case X86::COND_E  : return X86::COND_NE;
-  case X86::COND_G  : return X86::COND_LE;
-  case X86::COND_GE : return X86::COND_L;
-  case X86::COND_L  : return X86::COND_GE;
-  case X86::COND_LE : return X86::COND_G;
-  case X86::COND_NE : return X86::COND_E;
-  case X86::COND_B  : return X86::COND_AE;
-  case X86::COND_A  : return X86::COND_BE;
-  case X86::COND_BE : return X86::COND_A;
-  case X86::COND_AE : return X86::COND_B;
-    // clang-format on
-  }
-}
-
 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
 /// condition code, returning the condition code and the LHS/RHS of the
 /// comparison to make.
@@ -52832,8 +52814,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
     SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
     if (InvertCC)
       CC = DAG.getTargetConstant(
-          getInvertedX86CC((X86::CondCode)Cmov.getConstantOperandVal(2)), dl,
-          MVT::i8);
+          GetOppositeBranchCondition(
+              (X86::CondCode)Cmov.getConstantOperandVal(2)),
+          dl, MVT::i8);
     SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
                      Cmov.getOperand(3)};
     return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,

>From afd24c69918c28ddd1af41f99daa5ec0baf8d164 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Fri, 6 Dec 2024 22:29:48 +0800
Subject: [PATCH 3/4] Address review comments

---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  3 ++-
 llvm/test/CodeGen/X86/apx/cfcmov.ll     | 34 +++++++++++++++++++++++--
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 013395e93b5767..7c752917fc437f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52786,7 +52786,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Convert store(cmov(load(x), y), x) to cstore(y, x).
+  // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) and
+  //         store(cmov(x, load(p), CC), p) to cstore(x, p, !CC).
   if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
       Subtarget.hasCF() && St->isSimple()) {
     SDValue Cmov;
diff --git a/llvm/test/CodeGen/X86/apx/cfcmov.ll b/llvm/test/CodeGen/X86/apx/cfcmov.ll
index 22cd9534c9aafd..263531adf408b4 100644
--- a/llvm/test/CodeGen/X86/apx/cfcmov.ll
+++ b/llvm/test/CodeGen/X86/apx/cfcmov.ll
@@ -126,11 +126,41 @@ define void @cfcmov64mr(ptr %p, i64 %0) {
 ; CHECK-NEXT:    cmpq (%rdi), %rsi
 ; CHECK-NEXT:    cfcmovgq %rsi, (%rdi)
 ; CHECK-NEXT:    retq
-  %2 = load i64, ptr %p, align 2
+  %2 = load i64, ptr %p, align 8
   %3 = icmp sgt i64 %0, %2
   %4 = select i1 %3, i64 %0, i64 %2
-  store i64 %4, ptr %p, align 2
+  store i64 %4, ptr %p, align 8
+  ret void
+}
+
+define void @volatileload(ptr %p, i32 %0) {
+; CHECK-LABEL: volatileload:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    cmpl %eax, %esi
+; CHECK-NEXT:    cmovbl %esi, %eax
+; CHECK-NEXT:    movl %eax, (%rdi)
+; CHECK-NEXT:    retq
+  %2 = load volatile i32, ptr %p, align 4
+  %3 = call i32 @llvm.umin.i32(i32 %0, i32 %2)
+  store i32 %3, ptr %p, align 4
+  ret void
+}
+
+define void @atomicstore(ptr %p, i64 %0) {
+; CHECK-LABEL: atomicstore:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    cmpq %rax, %rsi
+; CHECK-NEXT:    cmovaq %rsi, %rax
+; CHECK-NEXT:    movq %rax, (%rdi)
+; CHECK-NEXT:    retq
+  %2 = load i64, ptr %p, align 8
+  %3 = icmp ugt i64 %0, %2
+  %4 = select i1 %3, i64 %0, i64 %2
+  store atomic i64 %4, ptr %p unordered, align 8
   ret void
 }
 
 declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)

>From 1b8b82d9ddd28226782bfd4a8598a7fabeea5b2c Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Sat, 7 Dec 2024 10:52:54 +0800
Subject: [PATCH 4/4] Add test for different ptr

---
 llvm/test/CodeGen/X86/apx/cfcmov.ll | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/llvm/test/CodeGen/X86/apx/cfcmov.ll b/llvm/test/CodeGen/X86/apx/cfcmov.ll
index 263531adf408b4..37ba3d451c2b16 100644
--- a/llvm/test/CodeGen/X86/apx/cfcmov.ll
+++ b/llvm/test/CodeGen/X86/apx/cfcmov.ll
@@ -162,5 +162,22 @@ define void @atomicstore(ptr %p, i64 %0) {
   ret void
 }
 
+define void @loadstorediffptr(ptr %p, i32 %0) {
+; CHECK-LABEL: loadstorediffptr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    cmpl %eax, %esi
+; CHECK-NEXT:    cmovbel %eax, %esi
+; CHECK-NEXT:    movl %esi, 4(%rdi)
+; CHECK-NEXT:    retq
+  %2 = getelementptr [2 x i32], ptr %p, i32 0, i32 0
+  %3 = load i32, ptr %2, align 4
+  %4 = icmp ule i32 %0, %3
+  %5 = select i1 %4, i32 %3, i32 %0
+  %6 = getelementptr [2 x i32], ptr %p, i32 0, i32 1
+  store i32 %5, ptr %6, align 4
+  ret void
+}
+
 declare i32 @llvm.smax.i32(i32, i32)
 declare i32 @llvm.umin.i32(i32, i32)