[llvm] [AMDGPU] Implement hasAndNot for scalar bitwise AND-NOT operations. (PR #112647)

Harrison Hao via llvm-commits llvm-commits at lists.llvm.org
Tue May 13 02:16:55 PDT 2025


https://github.com/harrisonGPU updated https://github.com/llvm/llvm-project/pull/112647

>From d4260d3ee4fbcd802a9adc46a086c2cbda3bd29e Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 09:59:03 +0800
Subject: [PATCH 01/11] [AMDGPU] Implement hasAndNot for scalar bitwise AND-NOT
 operations.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 ++++++++
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   | 2 ++
 2 files changed, 10 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 236c373e70250..0b94fbf869fc8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3731,6 +3731,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
   return DAG.getBuildVector(VT, DL, Args);
 }
 
+bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
+  if (Op->isDivergent())
+    return false;
+
+  EVT VT = Op.getValueType();
+  return VT == MVT::i32 || VT == MVT::i64;
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG optimizations
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index a42214865ccfd..3302cb533fc96 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -99,6 +99,8 @@ class AMDGPUTargetLowering : public TargetLowering {
 
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
+  bool hasAndNot(SDValue Y) const override;
+
 protected:
   bool shouldCombineMemoryType(EVT VT) const;
   SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
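
TargetLowering::hasAndNot is a query hook: target-independent DAG combines
consult it before producing or keeping an "and-not" value X = ~A & B, since
that shape only pays off when the target has a single instruction for it. On
AMDGPU the scalar ALU provides s_andn2_b32/s_andn2_b64, so the override above
answers true only for uniform i32/i64 values. As a minimal sketch (mirroring
the scalar_andn2 tests touched later in this series; the function name is
illustrative, not part of the patch), the IR shape in question is:

  define i32 @andn2_sketch(i32 inreg %a, i32 inreg %b) {
  entry:
    %nb = xor i32 %b, -1   ; ~b
    %r = and i32 %a, %nb   ; a & ~b -> s_andn2_b32 when uniform
    ret i32 %r
  }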

>From 890ffe780dfd3dedf1ccb887de2dc00d75587255 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 10:31:26 +0800
Subject: [PATCH 02/11] [AMDGPU] Update value name.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 3302cb533fc96..f54954b678dac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -99,7 +99,7 @@ class AMDGPUTargetLowering : public TargetLowering {
 
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
-  bool hasAndNot(SDValue Y) const override;
+  bool hasAndNot(SDValue Op) const override;
 
 protected:
   bool shouldCombineMemoryType(EVT VT) const;

>From 6e24fd2cc0fb88f3b88d12daacfe69694149d6c2 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 12:07:09 +0800
Subject: [PATCH 03/11] [AMDGPU] Update patch.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 16 ++++++++--------
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |  9 +++++++--
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0b94fbf869fc8..e85597091bf97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3731,14 +3731,6 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
   return DAG.getBuildVector(VT, DL, Args);
 }
 
-bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
-  if (Op->isDivergent())
-    return false;
-
-  EVT VT = Op.getValueType();
-  return VT == MVT::i32 || VT == MVT::i64;
-}
-
 //===----------------------------------------------------------------------===//
 // Custom DAG optimizations
 //===----------------------------------------------------------------------===//
@@ -6097,3 +6089,11 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                                Register N0, Register N1) const {
   return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
 }
+
+bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
+  if (Op->isDivergent())
+    return false;
+
+  EVT VT = Op.getValueType();
+  return VT == MVT::i32 || VT == MVT::i64;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index f54954b678dac..1d327755171d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -99,8 +99,6 @@ class AMDGPUTargetLowering : public TargetLowering {
 
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
-  bool hasAndNot(SDValue Op) const override;
-
 protected:
   bool shouldCombineMemoryType(EVT VT) const;
   SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -387,6 +385,13 @@ class AMDGPUTargetLowering : public TargetLowering {
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+
+  /// Return true if the target supports a bitwise and-not operation:
+  /// X = ~A & B
+  /// This function checks if the operation can be directly mapped to the
+  /// target's native instructions, potentially simplifying select or other
+  /// related instructions by using more efficient hardware-specific operations.
+  bool hasAndNot(SDValue Op) const override;
 };
 
 namespace AMDGPUISD {

>From 30f40a6dfd148c415c18e27a9eb662d5c155269a Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 12:54:13 +0800
Subject: [PATCH 04/11] [AMDGPU] Move to SIISelLowering.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 --------
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   | 7 -------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 8 ++++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.h       | 7 +++++++
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e85597091bf97..236c373e70250 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -6089,11 +6089,3 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                                Register N0, Register N1) const {
   return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
 }
-
-bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
-  if (Op->isDivergent())
-    return false;
-
-  EVT VT = Op.getValueType();
-  return VT == MVT::i32 || VT == MVT::i64;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 1d327755171d1..a42214865ccfd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -385,13 +385,6 @@ class AMDGPUTargetLowering : public TargetLowering {
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
-
-  /// Return true if the target supports a bitwise and-not operation:
-  /// X = ~A & B
-  /// This function checks if the operation can be directly mapped to the
-  /// target's native instructions, potentially simplifying select or other
-  /// related instructions by using more efficient hardware-specific operations.
-  bool hasAndNot(SDValue Op) const override;
 };
 
 namespace AMDGPUISD {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c05ba42d999e9..cd7fbf0d796cc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17525,3 +17525,11 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   AI->eraseFromParent();
   return LI;
 }
+
+bool SITargetLowering::hasAndNot(SDValue Op) const {
+  if (Op->isDivergent())
+    return false;
+
+  EVT VT = Op.getValueType();
+  return VT == MVT::i32 || VT == MVT::i64;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..484e0a221b4a8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -611,6 +611,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   MachineMemOperand::Flags
   getTargetMMOFlags(const Instruction &I) const override;
+
+  /// Return true if the target supports a bitwise and-not operation:
+  /// X = ~A & B
+  /// This function checks if the operation can be directly mapped to the
+  /// target's native instructions, potentially simplifying select or other
+  /// related instructions by using more efficient hardware-specific operations.
+  bool hasAndNot(SDValue Op) const override;
 };
 
 // Returns true if argument is a boolean value which is not serialized into

>From 0e68f59b4e375e635f2e913e97a9d891f5598e24 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 14:22:34 +0800
Subject: [PATCH 05/11] [AMDGPU] Update comments.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 ++
 llvm/lib/Target/AMDGPU/SIISelLowering.h   | 6 ------
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cd7fbf0d796cc..8858252790fd8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17527,6 +17527,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
 }
 
 bool SITargetLowering::hasAndNot(SDValue Op) const {
+  // Return false if the operation is divergent, as AND-NOT optimization
+  // requires uniform behavior across threads.
   if (Op->isDivergent())
     return false;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 484e0a221b4a8..b348702a5bd8d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -611,12 +611,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   MachineMemOperand::Flags
   getTargetMMOFlags(const Instruction &I) const override;
-
-  /// Return true if the target supports a bitwise and-not operation:
-  /// X = ~A & B
-  /// This function checks if the operation can be directly mapped to the
-  /// target's native instructions, potentially simplifying select or other
-  /// related instructions by using more efficient hardware-specific operations.
   bool hasAndNot(SDValue Op) const override;
 };
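
The comment added here captures the key constraint: s_andn2_b32/s_andn2_b64
are scalar (SALU) instructions, so an and-not is a single operation only when
the value is uniform. Divergent values still compile fine, they just lower on
the VALU as a separate not and and, similar to what the existing
vector_andn2_i32_s_v_one_use test in andorn2.ll already checks (v_not_b32
followed by v_and_b32). A hypothetical divergent counterpart of the earlier
sketch simply drops the inreg attributes, moving the operands into VGPRs:

  define i32 @divergent_andn2_sketch(i32 %a, i32 %b) {
  entry:
    %nb = xor i32 %b, -1   ; divergent, lives in a VGPR
    %r = and i32 %a, %nb   ; expected: v_not_b32 + v_and_b32, no andn2
    ret i32 %r
  }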
 

>From 8919a2144804006ffa408267e2a8077650c0fe43 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 13:21:18 +0800
Subject: [PATCH 06/11] [AMDGPU] Add a lit test for hasAndNot.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 86 +++++++++++++++++++++--
 llvm/lib/Target/AMDGPU/SIISelLowering.h   |  1 +
 llvm/test/CodeGen/AMDGPU/andornot.ll      | 39 ++++++++++
 3 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/andornot.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8858252790fd8..d42ddc4612d53 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7029,6 +7029,81 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
   }
 }
 
+SDValue SITargetLowering::combineAnd(SDValue Op,
+                                                DAGCombinerInfo &DCI) const {
+  const unsigned Opc = Op.getOpcode();
+  assert(Opc == ISD::AND);
+
+  auto &DAG = DCI.DAG;
+  SDLoc DL(Op);
+
+  if(hasAndNot(Op)) {
+    SDValue LHS = Op->getOperand(0);
+    SDValue RHS = Op->getOperand(1);
+
+    // (and LHS, (or Y, ~Z))
+    if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) {
+      SDValue Y = RHS->getOperand(0);
+      SDValue NotZ = RHS->getOperand(1);
+
+      if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) {
+        SDValue Z = NotZ->getOperand(0);
+
+        if (!isa<ConstantSDNode>(Y)) {
+          SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
+          SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
+          SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
+          SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
+          return NewAnd;
+        }
+      }
+    }
+  }
+    
+  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+                                 : Op->getOperand(0).getValueType();
+  auto ExtTy = OpTy.changeElementType(MVT::i32);
+
+  if (DCI.isBeforeLegalizeOps() ||
+      isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
+    return SDValue();
+
+  SDValue LHS;
+  SDValue RHS;
+  if (Opc == ISD::SELECT) {
+    LHS = Op->getOperand(1);
+    RHS = Op->getOperand(2);
+  } else {
+    LHS = Op->getOperand(0);
+    RHS = Op->getOperand(1);
+  }
+
+  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
+  LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
+
+  // Special case: for shifts, the RHS always needs a zext.
+  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
+    RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
+  else
+    RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
+
+  // setcc always return i1/i1 vec so no need to truncate after.
+  if (Opc == ISD::SETCC) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
+  }
+
+  // For other ops, we extend the operation's return type as well so we need to
+  // truncate back to the original type.
+  SDValue NewVal;
+  if (Opc == ISD::SELECT)
+    NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
+  else
+    NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
+
+  return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
+}
+
 SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
                                                 DAGCombinerInfo &DCI) const {
   const unsigned Opc = Op.getOpcode();
@@ -15244,13 +15319,17 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
 
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
+  case ISD::AND:
+    if (auto Res = combineAnd(SDValue(N, 0), DCI))
+      return Res;
+    break;
   case ISD::ADD:
   case ISD::SUB:
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:
-  case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
   case ISD::MUL:
@@ -15356,7 +15435,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::CLAMP:
     return performClampCombine(N, DCI);
   case ISD::SCALAR_TO_VECTOR: {
-    SelectionDAG &DAG = DCI.DAG;
     EVT VT = N->getValueType(0);
 
     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
@@ -17527,8 +17605,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
 }
 
 bool SITargetLowering::hasAndNot(SDValue Op) const {
-  // Return false if the operation is divergent, as AND-NOT optimization
-  // requires uniform behavior across threads.
+  // Return false if the operation is divergent, as AND-NOT is a scalar-only
+  // instruction.
   if (Op->isDivergent())
     return false;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index b348702a5bd8d..a418bae67ebc4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -148,6 +148,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/andornot.ll b/llvm/test/CodeGen/AMDGPU/andornot.ll
new file mode 100644
index 0000000000000..821709847ab8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/andornot.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i16
+; GCN: s_not_b32
+; GCN-NEXT: s_lshr_b32
+; GCN-NEXT: s_and_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) {
+entry:
+  %not_z = xor i16 %z, -1
+  %or_y_not_z = or i16 %y, %not_z
+  %and_result = and i16 %x, %or_y_not_z
+  store i16 %and_result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i32
+; GCN: s_andn2_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
+entry:
+  %not_z = xor i32 %z, -1
+  %or_y_not_z = or i32 %y, %not_z
+  %and_result = and i32 %x, %or_y_not_z
+  store i32 %and_result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i64
+; GCN: s_andn2_b64
+; GCN-NEXT: s_andn2_b64
+define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) {
+entry:
+  %not_z = xor i64 %z, -1
+  %or_y_not_z = or i64 %y, %not_z
+  %and_result = and i64 %x, %or_y_not_z
+  store i64 %and_result, ptr addrspace(1) %out, align 4
+  ret void
+}
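
The new combine rewrites (and X, (or Y, ~Z)) into (and X, (not (and (not Y), Z))).
The value is unchanged by De Morgan's law, and both remaining ANDs now have an
inverted operand, so for uniform i32/i64 each of them can become an and-not:

    X & (Y | ~Z)
  = X & ~(~Y & Z)               ; De Morgan: Y | ~Z == ~(~Y & Z)
  = s_andn2(X, s_andn2(Z, Y))   ; writing s_andn2(a, b) for a & ~b

which matches the two back-to-back s_andn2_b32 / s_andn2_b64 instructions the
new andornot.ll test expects for the i32 and i64 cases.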

>From 92427f6f948768febcad43a78fdc91121ebd7a3e Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 13:37:51 +0800
Subject: [PATCH 07/11] [AMDGPU] Fix clang format issue.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d42ddc4612d53..f7df921f41da6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7029,15 +7029,14 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
   }
 }
 
-SDValue SITargetLowering::combineAnd(SDValue Op,
-                                                DAGCombinerInfo &DCI) const {
+SDValue SITargetLowering::combineAnd(SDValue Op, DAGCombinerInfo &DCI) const {
   const unsigned Opc = Op.getOpcode();
   assert(Opc == ISD::AND);
 
   auto &DAG = DCI.DAG;
   SDLoc DL(Op);
 
-  if(hasAndNot(Op)) {
+  if (hasAndNot(Op)) {
     SDValue LHS = Op->getOperand(0);
     SDValue RHS = Op->getOperand(1);
 
@@ -7046,20 +7045,24 @@ SDValue SITargetLowering::combineAnd(SDValue Op,
       SDValue Y = RHS->getOperand(0);
       SDValue NotZ = RHS->getOperand(1);
 
-      if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) {
+      if (NotZ.getOpcode() == ISD::XOR &&
+          isAllOnesConstant(NotZ->getOperand(1))) {
         SDValue Z = NotZ->getOperand(0);
 
         if (!isa<ConstantSDNode>(Y)) {
           SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
-          SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
-          SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
-          SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
+          SDValue AndNotYZ =
+              DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
+          SDValue NotAndNotYZ =
+              DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
+          SDValue NewAnd =
+              DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
           return NewAnd;
         }
       }
     }
   }
-    
+
   EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
                                  : Op->getOperand(0).getValueType();
   auto ExtTy = OpTy.changeElementType(MVT::i32);

>From b448b133f730a7a2d37b618f0f965da11b560012 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 14:27:15 +0800
Subject: [PATCH 08/11] [AMDGPU] Remove combineAnd.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 87 +----------------------
 llvm/lib/Target/AMDGPU/SIISelLowering.h   |  1 -
 llvm/test/CodeGen/AMDGPU/andorn2.ll       | 44 ++++++++++++
 llvm/test/CodeGen/AMDGPU/andornot.ll      | 39 ----------
 4 files changed, 47 insertions(+), 124 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/andornot.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f7df921f41da6..e1082e878769f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7029,84 +7029,6 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
   }
 }
 
-SDValue SITargetLowering::combineAnd(SDValue Op, DAGCombinerInfo &DCI) const {
-  const unsigned Opc = Op.getOpcode();
-  assert(Opc == ISD::AND);
-
-  auto &DAG = DCI.DAG;
-  SDLoc DL(Op);
-
-  if (hasAndNot(Op)) {
-    SDValue LHS = Op->getOperand(0);
-    SDValue RHS = Op->getOperand(1);
-
-    // (and LHS, (or Y, ~Z))
-    if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) {
-      SDValue Y = RHS->getOperand(0);
-      SDValue NotZ = RHS->getOperand(1);
-
-      if (NotZ.getOpcode() == ISD::XOR &&
-          isAllOnesConstant(NotZ->getOperand(1))) {
-        SDValue Z = NotZ->getOperand(0);
-
-        if (!isa<ConstantSDNode>(Y)) {
-          SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
-          SDValue AndNotYZ =
-              DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
-          SDValue NotAndNotYZ =
-              DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
-          SDValue NewAnd =
-              DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
-          return NewAnd;
-        }
-      }
-    }
-  }
-
-  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
-                                 : Op->getOperand(0).getValueType();
-  auto ExtTy = OpTy.changeElementType(MVT::i32);
-
-  if (DCI.isBeforeLegalizeOps() ||
-      isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
-    return SDValue();
-
-  SDValue LHS;
-  SDValue RHS;
-  if (Opc == ISD::SELECT) {
-    LHS = Op->getOperand(1);
-    RHS = Op->getOperand(2);
-  } else {
-    LHS = Op->getOperand(0);
-    RHS = Op->getOperand(1);
-  }
-
-  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
-  LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
-
-  // Special case: for shifts, the RHS always needs a zext.
-  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
-    RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
-  else
-    RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
-
-  // setcc always return i1/i1 vec so no need to truncate after.
-  if (Opc == ISD::SETCC) {
-    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-    return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
-  }
-
-  // For other ops, we extend the operation's return type as well so we need to
-  // truncate back to the original type.
-  SDValue NewVal;
-  if (Opc == ISD::SELECT)
-    NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
-  else
-    NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
-
-  return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
-}
-
 SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
                                                 DAGCombinerInfo &DCI) const {
   const unsigned Opc = Op.getOpcode();
@@ -15322,17 +15244,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
 
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
-  SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
-  case ISD::AND:
-    if (auto Res = combineAnd(SDValue(N, 0), DCI))
-      return Res;
-    break;
   case ISD::ADD:
   case ISD::SUB:
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:
+  case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
   case ISD::MUL:
@@ -15438,6 +15356,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::CLAMP:
     return performClampCombine(N, DCI);
   case ISD::SCALAR_TO_VECTOR: {
+    SelectionDAG &DAG = DCI.DAG;
     EVT VT = N->getValueType(0);
 
     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
@@ -17610,7 +17529,7 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
 bool SITargetLowering::hasAndNot(SDValue Op) const {
   // Return false if the operation is divergent, as AND-NOT is a scalar-only
   // instruction.
-  if (Op->isDivergent())
+  if (Op->isDivergent() || !Op->isMachineOpcode())
     return false;
 
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a418bae67ebc4..b348702a5bd8d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -148,7 +148,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index 3226a77bb9d34..e1fdddf4438b6 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -25,6 +25,28 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr
+; GCN: s_andn2_b32
+define amdgpu_kernel void @scalar_andn2_i32_one_sgpr(
+    ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+entry:
+  %nb = xor i32 %b, -1
+  %r0.val = and i32 %a, %nb
+  store i32 %r0.val, ptr addrspace(1) %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr
+; GCN: s_andn2_b64
+define amdgpu_kernel void @scalar_andn2_i64_one_sgpr(
+    ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+entry:
+  %nb = xor i64 %b, -1
+  %r0.val = and i64 %a, %nb
+  store i64 %r0.val, ptr addrspace(1) %r0
+  ret void
+}
+
 ; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
 ; GCN: s_orn2_b32
 define amdgpu_kernel void @scalar_orn2_i32_one_use(
@@ -47,6 +69,28 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr
+; GCN: s_orn2_b32
+define amdgpu_kernel void @scalar_orn2_i32_one_use_sgpr(
+    ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+entry:
+  %nb = xor i32 %b, -1
+  %r0.val = or i32 %a, %nb
+  store i32 %r0.val, ptr addrspace(1) %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr
+; GCN: s_orn2_b64
+define amdgpu_kernel void @scalar_orn2_i64_one_use_sgpr(
+    ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+entry:
+  %nb = xor i64 %b, -1
+  %r0.val = or i64 %a, %nb
+  store i64 %r0.val, ptr addrspace(1) %r0
+  ret void
+}
+
 ; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
 ; GCN: v_not_b32
 ; GCN: v_and_b32
diff --git a/llvm/test/CodeGen/AMDGPU/andornot.ll b/llvm/test/CodeGen/AMDGPU/andornot.ll
deleted file mode 100644
index 821709847ab8d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/andornot.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}scalar_and_or_not_i16
-; GCN: s_not_b32
-; GCN-NEXT: s_lshr_b32
-; GCN-NEXT: s_and_b32
-; GCN-NEXT: s_andn2_b32
-define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) {
-entry:
-  %not_z = xor i16 %z, -1
-  %or_y_not_z = or i16 %y, %not_z
-  %and_result = and i16 %x, %or_y_not_z
-  store i16 %and_result, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}scalar_and_or_not_i32
-; GCN: s_andn2_b32
-; GCN-NEXT: s_andn2_b32
-define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
-entry:
-  %not_z = xor i32 %z, -1
-  %or_y_not_z = or i32 %y, %not_z
-  %and_result = and i32 %x, %or_y_not_z
-  store i32 %and_result, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}scalar_and_or_not_i64
-; GCN: s_andn2_b64
-; GCN-NEXT: s_andn2_b64
-define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) {
-entry:
-  %not_z = xor i64 %z, -1
-  %or_y_not_z = or i64 %y, %not_z
-  %and_result = and i64 %x, %or_y_not_z
-  store i64 %and_result, ptr addrspace(1) %out, align 4
-  ret void
-}
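
This revision removes the hand-written combineAnd, together with its
andornot.ll test, and instead extends andorn2.ll with SGPR-only variants of
the plain andn2/orn2 patterns; the hasAndNot() override itself stays (it also
temporarily gains a !Op->isMachineOpcode() guard, dropped again in a later
revision). With the custom combine gone, one of the generic consumers of
hasAndNot() is the DAG combiner's masked-merge unfolding, which rewrites

  ((x ^ y) & mask) ^ y   -->   (x & mask) | (y & ~mask)

when an and-not instruction is available; dedicated coverage for that lands
later in the series as unfold-masked-merge-scalar-variablemask.ll.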

>From f23fa09c31d06acfc4d2b11d42479506c519124d Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Mon, 21 Oct 2024 15:59:26 +0800
Subject: [PATCH 09/11] [AMDGPU] Update lit test.

---
 llvm/test/CodeGen/AMDGPU/andorn2.ll | 32 +++++++++++------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index e1fdddf4438b6..4fe7e21b2adea 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -27,24 +27,20 @@ entry:
 
 ; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr
 ; GCN: s_andn2_b32
-define amdgpu_kernel void @scalar_andn2_i32_one_sgpr(
-    ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+define i32 @scalar_andn2_i32_one_sgpr(i32 inreg %a, i32 inreg %b) {
 entry:
   %nb = xor i32 %b, -1
-  %r0.val = and i32 %a, %nb
-  store i32 %r0.val, ptr addrspace(1) %r0
-  ret void
+  %and = and i32 %a, %nb
+  ret i32 %and
 }
 
 ; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr
 ; GCN: s_andn2_b64
-define amdgpu_kernel void @scalar_andn2_i64_one_sgpr(
-    ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+define i64 @scalar_andn2_i64_one_sgpr(i64 inreg %a, i64 inreg %b) {
 entry:
   %nb = xor i64 %b, -1
-  %r0.val = and i64 %a, %nb
-  store i64 %r0.val, ptr addrspace(1) %r0
-  ret void
+  %and = and i64 %a, %nb
+  ret i64 %and
 }
 
 ; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
@@ -71,24 +67,20 @@ entry:
 
 ; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr
 ; GCN: s_orn2_b32
-define amdgpu_kernel void @scalar_orn2_i32_one_use_sgpr(
-    ptr addrspace(1) %r0, i32 inreg %a, i32 inreg %b) {
+define i32 @scalar_orn2_i32_one_use_sgpr(i32 inreg %a, i32 inreg %b) {
 entry:
   %nb = xor i32 %b, -1
-  %r0.val = or i32 %a, %nb
-  store i32 %r0.val, ptr addrspace(1) %r0
-  ret void
+  %or = or i32 %a, %nb
+  ret i32 %or;
 }
 
 ; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr
 ; GCN: s_orn2_b64
-define amdgpu_kernel void @scalar_orn2_i64_one_use_sgpr(
-    ptr addrspace(1) %r0, i64 inreg %a, i64 inreg %b) {
+define i64 @scalar_orn2_i64_one_use_sgpr(i64 inreg %a, i64 inreg %b) {
 entry:
   %nb = xor i64 %b, -1
-  %r0.val = or i64 %a, %nb
-  store i64 %r0.val, ptr addrspace(1) %r0
-  ret void
+  %or = or i64 %a, %nb
+  ret i64 %or;
 }
 
 ; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use

>From 307a27581aefe048a1c68e349d23d54ab216c14e Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Sun, 27 Apr 2025 18:03:24 +0800
Subject: [PATCH 10/11] [AMDGPU] Add unfold test.
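
The test file added below mirrors the unfold-masked-merge-scalar-variablemask.ll
tests that already exist for other targets: the "out" functions spell out the
unfolded masked merge (x & mask) | (y & ~mask) directly, while the "in"
functions use the folded form ((x ^ y) & mask) ^ y and rely on the DAG
combiner, guided by hasAndNot(), to unfold it so that gfx1100 can use
s_and_not1_b32 / s_and_not1_b64. The two forms are equivalent bit for bit:

  ((x ^ y) & m) ^ y
    = ((x & m) ^ (y & m)) ^ y   ; AND distributes over XOR
    = (x & m) ^ (y & ~m)        ; (y & m) ^ y == y & ~m
    = (x & m) | (y & ~m)        ; the two terms cover disjoint mask bits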

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   2 +-
 llvm/test/CodeGen/AMDGPU/andorn2.ll           |  36 -
 ...unfold-masked-merge-scalar-variablemask.ll | 764 ++++++++++++++++++
 3 files changed, 765 insertions(+), 37 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e1082e878769f..970ae28f39d0c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17529,7 +17529,7 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
 bool SITargetLowering::hasAndNot(SDValue Op) const {
   // Return false if the operation is divergent, as AND-NOT is a scalar-only
   // instruction.
-  if (Op->isDivergent() || !Op->isMachineOpcode())
+  if (Op->isDivergent())
     return false;
 
   EVT VT = Op.getValueType();
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index 4fe7e21b2adea..3226a77bb9d34 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -25,24 +25,6 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}scalar_andn2_i32_one_sgpr
-; GCN: s_andn2_b32
-define i32 @scalar_andn2_i32_one_sgpr(i32 inreg %a, i32 inreg %b) {
-entry:
-  %nb = xor i32 %b, -1
-  %and = and i32 %a, %nb
-  ret i32 %and
-}
-
-; GCN-LABEL: {{^}}scalar_andn2_i64_one_sgpr
-; GCN: s_andn2_b64
-define i64 @scalar_andn2_i64_one_sgpr(i64 inreg %a, i64 inreg %b) {
-entry:
-  %nb = xor i64 %b, -1
-  %and = and i64 %a, %nb
-  ret i64 %and
-}
-
 ; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
 ; GCN: s_orn2_b32
 define amdgpu_kernel void @scalar_orn2_i32_one_use(
@@ -65,24 +47,6 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}scalar_orn2_i32_one_use_sgpr
-; GCN: s_orn2_b32
-define i32 @scalar_orn2_i32_one_use_sgpr(i32 inreg %a, i32 inreg %b) {
-entry:
-  %nb = xor i32 %b, -1
-  %or = or i32 %a, %nb
-  ret i32 %or;
-}
-
-; GCN-LABEL: {{^}}scalar_orn2_i64_one_use_sgpr
-; GCN: s_orn2_b64
-define i64 @scalar_orn2_i64_one_use_sgpr(i64 inreg %a, i64 inreg %b) {
-entry:
-  %nb = xor i64 %b, -1
-  %or = or i64 %a, %nb
-  ret i64 %or;
-}
-
 ; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
 ; GCN: v_not_b32
 ; GCN: v_and_b32
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
new file mode 100644
index 0000000000000..6eeeb71006399
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -0,0 +1,764 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+
+define i32 @out32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %mx = and i32 %x, %mask
+  %notmask = xor i32 %mask, -1
+  %my = and i32 %y, %notmask
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+define i64 @out64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
+; GCN-LABEL: out64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[16:17]
+; GCN-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[16:17]
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %mx = and i64 %x, %mask
+  %notmask = xor i64 %mask, -1
+  %my = and i64 %y, %notmask
+  %r = or i64 %mx, %my
+  ret i64 %r
+}
+
+define i32 @in32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+define i64 @in64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
+; GCN-LABEL: in64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[16:17]
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[16:17]
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i64 %x, %y
+  %n1 = and i64 %n0, %mask
+  %r = xor i64 %n1, %y
+  ret i64 %r
+}
+; ============================================================================ ;
+; Commutativity tests.
+; ============================================================================ ;
+define i32 @in_commutativity_0_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_0_0_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %mask, %n0
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+define i32 @in_commutativity_0_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_0_1_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %y, %n1
+  ret i32 %r
+}
+
+define i32 @in_commutativity_0_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_0_1_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %mask, %n0
+  %r = xor i32 %y, %n1
+  ret i32 %r
+}
+
+define i32 @in_commutativity_1_0_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_1_0_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, %x
+  ret i32 %r
+}
+
+define i32 @in_commutativity_1_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_1_0_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %mask, %n0
+  %r = xor i32 %n1, %x
+  ret i32 %r
+}
+
+define i32 @in_commutativity_1_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_1_1_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %x, %n1
+  ret i32 %r
+}
+
+define i32 @in_commutativity_1_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_commutativity_1_1_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %mask, %n0
+  %r = xor i32 %x, %n1
+  ret i32 %r
+}
+; ============================================================================ ;
+; Y is an 'and' too.
+; ============================================================================ ;
+define i32 @in_complex_y0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %mask) {
+; GCN-LABEL: in_complex_y0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s3
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s3
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %y = and i32 %y_hi, %y_low
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+define i32 @in_complex_y1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %mask) {
+; GCN-LABEL: in_complex_y1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s3
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s3
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %y = and i32 %y_hi, %y_low
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %y, %n1
+  ret i32 %r
+}
+; ============================================================================ ;
+; M is an 'xor' too.
+; ============================================================================ ;
+define i32 @in_complex_m0(i32 inreg %x, i32 inreg %y, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_m0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_xor_b32 s2, s2, s3
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %mask = xor i32 %m_a, %m_b
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+define i32 @in_complex_m1(i32 inreg %x, i32 inreg %y, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_m1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_xor_b32 s2, s2, s3
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %mask = xor i32 %m_a, %m_b
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %mask, %n0
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+; ============================================================================ ;
+; Both Y and M are complex.
+; ============================================================================ ;
+define i32 @in_complex_y0_m0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_y0_m0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s2, s3, s16
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %y = and i32 %y_hi, %y_low
+  %mask = xor i32 %m_a, %m_b
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+define i32 @in_complex_y1_m0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_y1_m0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s2, s3, s16
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %y = and i32 %y_hi, %y_low
+  %mask = xor i32 %m_a, %m_b
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %y, %n1
+  ret i32 %r
+}
+
+define i32 @in_complex_y0_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_y0_m1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s2, s3, s16
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %y = and i32 %y_hi, %y_low
+  %mask = xor i32 %m_a, %m_b
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %mask, %n0
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+define i32 @in_complex_y1_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 inreg %m_a, i32 inreg %m_b) {
+; GCN-LABEL: in_complex_y1_m1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s2, s3, s16
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %y = and i32 %y_hi, %y_low
+  %mask = xor i32 %m_a, %m_b
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %mask, %n0
+  %r = xor i32 %y, %n1
+  ret i32 %r
+}
+; ============================================================================ ;
+; Various cases with %x and/or %y being a constant
+; ============================================================================ ;
+define i32 @out_constant_varx_mone(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_varx_mone:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s2, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_not1_b32 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %mx = and i32 %mask, %x
+  %my = and i32 %notmask, -1
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+define i32 @in_constant_varx_mone(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_varx_mone:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_not_b32 s0, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_nand_b32 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, -1
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, -1
+  ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @out_constant_varx_mone_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_varx_mone_invmask:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_b32 s0, s0, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %mx = and i32 %notmask, %x
+  %my = and i32 %mask, -1
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @in_constant_varx_mone_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_varx_mone_invmask:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_not_b32 s1, s2
+; GCN-NEXT:    s_not_b32 s0, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_nand_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %n0 = xor i32 %x, -1
+  %n1 = and i32 %n0, %notmask
+  %r = xor i32 %n1, -1
+  ret i32 %r
+}
+
+define i32 @out_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_varx_42:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s2, s0
+; GCN-NEXT:    s_and_not1_b32 s1, 42, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %mx = and i32 %mask, %x
+  %my = and i32 %notmask, 42
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+define i32 @in_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_varx_42:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s1, 42, s2
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, 42
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, 42
+  ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @out_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_varx_42_invmask:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s1, s2, 42
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %mx = and i32 %notmask, %x
+  %my = and i32 %mask, 42
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @in_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_varx_42_invmask:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s1, s2, 42
+; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %n0 = xor i32 %x, 42
+  %n1 = and i32 %n0, %notmask
+  %r = xor i32 %n1, 42
+  ret i32 %r
+}
+
+define i32 @out_constant_mone_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_mone_vary:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_b32 s0, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %mx = and i32 %mask, -1
+  %my = and i32 %notmask, %y
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+define i32 @in_constant_mone_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_mone_vary:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_b32 s0, s2, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 -1, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @out_constant_mone_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_mone_vary_invmask:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s2, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_not1_b32 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %mx = and i32 %notmask, -1
+  %my = and i32 %mask, %y
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @in_constant_mone_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_mone_vary_invmask:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_not1_b32 s0, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %n0 = xor i32 -1, %y
+  %n1 = and i32 %n0, %notmask
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+define i32 @out_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_42_vary:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s2, 42
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %mx = and i32 %mask, 42
+  %my = and i32 %notmask, %y
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+define i32 @in_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_42_vary:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s0, s1, s2
+; GCN-NEXT:    s_and_b32 s1, s2, 42
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 42, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @out_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: out_constant_42_vary_invmask:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_not1_b32 s0, 42, s2
+; GCN-NEXT:    s_and_b32 s1, s2, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %mx = and i32 %notmask, 42
+  %my = and i32 %mask, %y
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define i32 @in_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: in_constant_42_vary_invmask:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s1, s2
+; GCN-NEXT:    s_and_not1_b32 s1, 42, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %notmask = xor i32 %mask, -1
+  %n0 = xor i32 42, %y
+  %n1 = and i32 %n0, %notmask
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+; ============================================================================ ;
+; Negative tests. Should not be folded.
+; ============================================================================ ;
+; Multi-use tests.
+declare void @use32(i32) nounwind
+define i32 @in_multiuse_A(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg %mask) nounwind {
+; GCN-LABEL: in_multiuse_A:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s2, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_or_saveexec_b32 s16, -1
+; GCN-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b32 exec_lo, s16
+; GCN-NEXT:    v_writelane_b32 v40, s2, 4
+; GCN-NEXT:    s_add_i32 s32, s32, 16
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, use32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, use32@gotpcrel32@hi+12
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_load_b64 s[16:17], s[16:17], 0x0
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    v_writelane_b32 v40, s34, 2
+; GCN-NEXT:    s_mov_b32 s34, s1
+; GCN-NEXT:    v_writelane_b32 v40, s35, 3
+; GCN-NEXT:    s_and_b32 s35, s0, s3
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s35
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    s_xor_b32 s0, s35, s34
+; GCN-NEXT:    v_readlane_b32 s35, v40, 3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_readlane_b32 s34, v40, 2
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    v_readlane_b32 s0, v40, 4
+; GCN-NEXT:    s_or_saveexec_b32 s1, -1
+; GCN-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b32 exec_lo, s1
+; GCN-NEXT:    s_mov_b32 s33, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  call void @use32(i32 %n1)
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+define i32 @in_multiuse_B(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg %mask) nounwind {
+; GCN-LABEL: in_multiuse_B:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s2, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_or_saveexec_b32 s16, -1
+; GCN-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b32 exec_lo, s16
+; GCN-NEXT:    s_add_i32 s32, s32, 16
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, use32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, use32@gotpcrel32@hi+12
+; GCN-NEXT:    v_writelane_b32 v40, s2, 4
+; GCN-NEXT:    s_load_b64 s[16:17], s[16:17], 0x0
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    v_writelane_b32 v40, s34, 2
+; GCN-NEXT:    s_mov_b32 s34, s1
+; GCN-NEXT:    v_writelane_b32 v40, s35, 3
+; GCN-NEXT:    s_and_b32 s35, s0, s3
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    s_xor_b32 s0, s35, s34
+; GCN-NEXT:    v_readlane_b32 s35, v40, 3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_readlane_b32 s34, v40, 2
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    v_readlane_b32 s0, v40, 4
+; GCN-NEXT:    s_or_saveexec_b32 s1, -1
+; GCN-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b32 exec_lo, s1
+; GCN-NEXT:    s_mov_b32 s33, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  call void @use32(i32 %n0)
+  %r = xor i32 %n1, %y
+  ret i32 %r
+}
+
+; Various bad variants
+define i32 @n0_badmask(i32 inreg %x, i32 inreg %y, i32 inreg %mask, i32 inreg %mask2) {
+; GCN-LABEL: n0_badmask:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s3
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %mx = and i32 %x, %mask
+  %notmask = xor i32 %mask2, -1
+  %my = and i32 %y, %notmask
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+define i32 @n0_badxor(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
+; GCN-LABEL: n0_badxor:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_xor_b32 s3, s2, 1
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s1, s1, s3
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %mx = and i32 %x, %mask
+  %notmask = xor i32 %mask, 1
+  %my = and i32 %y, %notmask
+  %r = or i32 %mx, %my
+  ret i32 %r
+}
+
+define i32 @n1_thirdvar(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg %mask) {
+; GCN-LABEL: n1_thirdvar:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_b32 s0, s0, s3
+; GCN-NEXT:    s_xor_b32 s0, s0, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %n0 = xor i32 %x, %y
+  %n1 = and i32 %n0, %mask
+  %r = xor i32 %n1, %z
+  ret i32 %r
+}

>From 5e6480140600702350d19c67aa24260aad45ca4b Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Sun, 27 Apr 2025 18:57:18 +0800
Subject: [PATCH 11/11] [AMDGPU] Update.

---
 llvm/test/CodeGen/AMDGPU/bfi_int.ll          | 109 ++-
 llvm/test/CodeGen/AMDGPU/commute-compares.ll | 866 +++++++++++++++++++
 2 files changed, 933 insertions(+), 42 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 201b97d479c68..6e9cd8807b379 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -135,9 +135,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_xor_b32 s1, s1, s2
-; GFX7-NEXT:    s_and_b32 s0, s0, s1
-; GFX7-NEXT:    s_xor_b32 s0, s2, s0
+; GFX7-NEXT:    s_andn2_b32 s2, s2, s0
+; GFX7-NEXT:    s_and_b32 s0, s1, s0
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
@@ -147,9 +147,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_xor_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s0, s0, s1
-; GFX8-NEXT:    s_xor_b32 s0, s2, s0
+; GFX8-NEXT:    s_andn2_b32 s2, s2, s0
+; GFX8-NEXT:    s_and_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -163,9 +163,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_xor_b32 s1, s1, s2
-; GFX10-NEXT:    s_and_b32 s0, s0, s1
-; GFX10-NEXT:    s_xor_b32 s0, s2, s0
+; GFX10-NEXT:    s_andn2_b32 s2, s2, s0
+; GFX10-NEXT:    s_and_b32 s0, s1, s0
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -317,19 +317,26 @@ entry:
 define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
 ; GFX7-LABEL: s_s_v_bfi_sha256_ch:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    v_bfi_b32 v0, v1, s1, v0
+; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT:    s_nand_b32 s0, s1, s0
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_s_v_bfi_sha256_ch:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_bfi_b32 v0, v1, s1, v0
+; GFX8-NEXT:    s_not_b32 s1, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    s_nand_b32 s0, s1, s0
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_s_v_bfi_sha256_ch:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    v_bfi_b32 v0, s0, s1, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    s_not_b32 s1, s1
+; GFX10-NEXT:    s_nand_b32 s0, s1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
@@ -350,30 +357,40 @@ entry:
   ret float %cast
 }
 
-define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
+define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
 ; GFX7-LABEL: s_v_v_bfi_sha256_ch:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT:    s_nand_b32 s0, s1, s0
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_v_v_bfi_sha256_ch:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; GFX8-NEXT:    s_not_b32 s1, s1
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    s_nand_b32 s0, s1, s0
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_v_v_bfi_sha256_ch:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    s_not_b32 s1, s1
+; GFX10-NEXT:    s_nand_b32 s0, s1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v1, s1, v0
 ; GFX8-GISEL-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, s1, v0
 ; GFX10-GISEL-NEXT:    ; return to shader part epilog
 entry:
   %xor0 = xor i32 %y, %z
@@ -1008,24 +1025,32 @@ define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i6
 define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
 ; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
-; GFX7-NEXT:    v_bfi_b32 v1, s3, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    v_bfi_b32 v0, s2, v2, v0
+; GFX7-NEXT:    s_not_b64 s[0:1], s[0:1]
+; GFX7-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX7-NEXT:    s_nand_b64 s[0:1], s[0:1], s[2:3]
+; GFX7-NEXT:    v_and_b32_e32 v1, s1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_bfi_b32 v1, s3, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_bfi_b32 v0, s2, v2, v0
+; GFX8-NEXT:    s_not_b64 s[0:1], s[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX8-NEXT:    s_nand_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v1, s1, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_bfi_b32 v0, s2, s0, v0
-; GFX10-NEXT:    v_bfi_b32 v1, s3, s1, v1
+; GFX10-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX10-NEXT:    s_not_b64 s[0:1], s[0:1]
+; GFX10-NEXT:    s_nand_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
@@ -1495,9 +1520,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX7-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX7-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX7-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX7-NEXT:    s_add_u32 s0, s0, 10
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -1510,9 +1535,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-NEXT:    s_add_u32 s0, s0, 10
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1526,9 +1551,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10-NEXT:    s_add_u32 s0, s0, 10
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
@@ -1583,9 +1608,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX7-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX7-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX7-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX7-NEXT:    s_add_u32 s0, s0, 10
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -1598,9 +1623,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-NEXT:    s_add_u32 s0, s0, 10
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1614,9 +1639,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[2:3]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10-NEXT:    s_add_u32 s0, s0, 10
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index fcb871cedd0cb..a14cd8b7a12bb 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -9,6 +10,22 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; GCN-LABEL: {{^}}commute_eq_64_i32:
 ; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_eq_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_eq_64_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 64, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -22,6 +39,22 @@ define amdgpu_kernel void @commute_eq_64_i32(ptr addrspace(1) %out, ptr addrspac
 ; GCN-LABEL: {{^}}commute_ne_64_i32:
 ; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ne_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ne_64_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 64, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -37,6 +70,23 @@ define amdgpu_kernel void @commute_ne_64_i32(ptr addrspace(1) %out, ptr addrspac
 ; GCN: s_movk_i32 [[K:s[0-9]+]], 0x3039
 ; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
 define amdgpu_kernel void @commute_ne_litk_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ne_litk_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_movk_i32 s4, 0x3039
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -50,6 +100,22 @@ define amdgpu_kernel void @commute_ne_litk_i32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ugt_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ugt_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_64_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 64, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -63,6 +129,22 @@ define amdgpu_kernel void @commute_ugt_64_i32(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: {{^}}commute_uge_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
 define amdgpu_kernel void @commute_uge_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_64_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 63, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -76,6 +158,22 @@ define amdgpu_kernel void @commute_uge_64_i32(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: {{^}}commute_ult_64_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ult_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_64_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -89,6 +187,22 @@ define amdgpu_kernel void @commute_ult_64_i32(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: {{^}}commute_ule_63_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ule_63_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_63_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -103,6 +217,23 @@ define amdgpu_kernel void @commute_ule_63_i32(ptr addrspace(1) %out, ptr addrspa
 ; GCN: s_movk_i32 [[K:s[0-9]+]], 0x41{{$}}
 ; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
 define amdgpu_kernel void @commute_ule_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_64_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_movk_i32 s4, 0x41
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -116,6 +247,22 @@ define amdgpu_kernel void @commute_ule_64_i32(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
 ; GCN: v_ashrrev_i32_e32 v2, 31, v2
 define amdgpu_kernel void @commute_sgt_neg1_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sgt_neg1_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_not_b32_e32 v2, v2
+; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -129,6 +276,22 @@ define amdgpu_kernel void @commute_sgt_neg1_i32(ptr addrspace(1) %out, ptr addrs
 ; GCN-LABEL: {{^}}commute_sge_neg2_i32:
 ; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
 define amdgpu_kernel void @commute_sge_neg2_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sge_neg2_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, -3, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -142,6 +305,22 @@ define amdgpu_kernel void @commute_sge_neg2_i32(ptr addrspace(1) %out, ptr addrs
 ; GCN-LABEL: {{^}}commute_slt_neg16_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
 define amdgpu_kernel void @commute_slt_neg16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_slt_neg16_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, -16, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -155,6 +334,22 @@ define amdgpu_kernel void @commute_slt_neg16_i32(ptr addrspace(1) %out, ptr addr
 ; GCN-LABEL: {{^}}commute_sle_5_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
 define amdgpu_kernel void @commute_sle_5_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sle_5_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 6, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -172,6 +367,23 @@ define amdgpu_kernel void @commute_sle_5_i32(ptr addrspace(1) %out, ptr addrspac
 ; GCN-LABEL: {{^}}commute_eq_64_i64:
 ; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_eq_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_eq_64_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -185,6 +397,23 @@ define amdgpu_kernel void @commute_eq_64_i64(ptr addrspace(1) %out, ptr addrspac
 ; GCN-LABEL: {{^}}commute_ne_64_i64:
 ; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ne_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ne_64_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_ne_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -198,6 +427,23 @@ define amdgpu_kernel void @commute_ne_64_i64(ptr addrspace(1) %out, ptr addrspac
 ; GCN-LABEL: {{^}}commute_ugt_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ugt_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_64_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lt_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -211,6 +457,23 @@ define amdgpu_kernel void @commute_ugt_64_i64(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: {{^}}commute_uge_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_uge_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_64_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -224,6 +487,23 @@ define amdgpu_kernel void @commute_uge_64_i64(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: {{^}}commute_ult_64_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ult_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_64_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -237,6 +517,23 @@ define amdgpu_kernel void @commute_ult_64_i64(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: {{^}}commute_ule_63_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_63_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -253,6 +550,24 @@ define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspa
 ; GCN: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x41
 ; GCN: v_cmp_gt_u64_e32 vcc, [[K]], v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_64_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[4:5], 0x41
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -266,6 +581,24 @@ define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sgt_neg1_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; GCN-NEXT:    v_not_b32_e32 v0, v0
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -279,6 +612,23 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs
 ; GCN-LABEL: {{^}}commute_sge_neg2_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_sge_neg2_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sge_neg2_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lt_i64_e32 vcc, -3, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -292,6 +642,23 @@ define amdgpu_kernel void @commute_sge_neg2_i64(ptr addrspace(1) %out, ptr addrs
 ; GCN-LABEL: {{^}}commute_slt_neg16_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_slt_neg16_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_slt_neg16_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, -16, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -305,6 +672,23 @@ define amdgpu_kernel void @commute_slt_neg16_i64(ptr addrspace(1) %out, ptr addr
 ; GCN-LABEL: {{^}}commute_sle_5_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_sle_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sle_5_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 6, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -323,6 +707,22 @@ define amdgpu_kernel void @commute_sle_5_i64(ptr addrspace(1) %out, ptr addrspac
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
 ; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_oeq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oeq_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_eq_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -337,6 +737,22 @@ define amdgpu_kernel void @commute_oeq_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
 ; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ogt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ogt_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -350,6 +766,22 @@ define amdgpu_kernel void @commute_ogt_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_oge_2.0_f32:
 ; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_oge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oge_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_le_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -363,6 +795,22 @@ define amdgpu_kernel void @commute_oge_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_olt_2.0_f32:
 ; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_olt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_olt_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -376,6 +824,22 @@ define amdgpu_kernel void @commute_olt_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ole_2.0_f32:
 ; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ole_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ole_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_ge_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -389,6 +853,22 @@ define amdgpu_kernel void @commute_ole_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_one_2.0_f32:
 ; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_one_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_one_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lg_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -402,6 +882,22 @@ define amdgpu_kernel void @commute_one_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ord_2.0_f32:
 ; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
 define amdgpu_kernel void @commute_ord_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ord_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -415,6 +911,22 @@ define amdgpu_kernel void @commute_ord_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
 ; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ueq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ueq_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_nlg_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -428,6 +940,22 @@ define amdgpu_kernel void @commute_ueq_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
 ; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ugt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_nge_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -441,6 +969,22 @@ define amdgpu_kernel void @commute_ugt_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_uge_2.0_f32:
 ; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_uge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -454,6 +998,22 @@ define amdgpu_kernel void @commute_uge_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ult_2.0_f32:
 ; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ult_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_nle_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -467,6 +1027,22 @@ define amdgpu_kernel void @commute_ult_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ule_2.0_f32:
 ; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_ule_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -480,6 +1056,22 @@ define amdgpu_kernel void @commute_ule_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_une_2.0_f32:
 ; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
 define amdgpu_kernel void @commute_une_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_une_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, 2.0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -493,6 +1085,22 @@ define amdgpu_kernel void @commute_une_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_uno_2.0_f32:
 ; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
 define amdgpu_kernel void @commute_uno_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uno_2.0_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -511,6 +1119,23 @@ define amdgpu_kernel void @commute_uno_2.0_f32(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
 ; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_oeq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oeq_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_eq_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -525,6 +1150,23 @@ define amdgpu_kernel void @commute_oeq_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
 ; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ogt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ogt_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -538,6 +1180,23 @@ define amdgpu_kernel void @commute_ogt_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_oge_2.0_f64:
 ; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_oge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oge_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_le_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -551,6 +1210,23 @@ define amdgpu_kernel void @commute_oge_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_olt_2.0_f64:
 ; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_olt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_olt_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -564,6 +1240,23 @@ define amdgpu_kernel void @commute_olt_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ole_2.0_f64:
 ; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ole_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ole_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_ge_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -577,6 +1270,23 @@ define amdgpu_kernel void @commute_ole_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_one_2.0_f64:
 ; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_one_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_one_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_lg_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -590,6 +1300,23 @@ define amdgpu_kernel void @commute_one_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ord_2.0_f64:
 ; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
 define amdgpu_kernel void @commute_ord_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ord_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_o_f64_e32 vcc, v[3:4], v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -603,6 +1330,23 @@ define amdgpu_kernel void @commute_ord_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
 ; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ueq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ueq_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_nlg_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -616,6 +1360,23 @@ define amdgpu_kernel void @commute_ueq_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
 ; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ugt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_nge_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -629,6 +1390,23 @@ define amdgpu_kernel void @commute_ugt_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_uge_2.0_f64:
 ; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_uge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_ngt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -642,6 +1420,23 @@ define amdgpu_kernel void @commute_uge_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ult_2.0_f64:
 ; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ult_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_nle_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -655,6 +1450,23 @@ define amdgpu_kernel void @commute_ult_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_ule_2.0_f64:
 ; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ule_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -668,6 +1480,23 @@ define amdgpu_kernel void @commute_ule_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_une_2.0_f64:
 ; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_une_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_une_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_neq_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -681,6 +1510,23 @@ define amdgpu_kernel void @commute_une_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: {{^}}commute_uno_2.0_f64:
 ; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
 define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uno_2.0_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[3:4], v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -702,6 +1548,26 @@ define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrsp
 ; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
 ; GCN: v_cmp_eq_u32_e32 vcc,  [[FI]], v{{[0-9]+}}
 define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 {
+; GCN-LABEL: commute_frameindex:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-NEXT:    s_mov_b32 s14, -1
+; GCN-NEXT:    s_mov_b32 s15, 0xe8f000
+; GCN-NEXT:    s_add_u32 s12, s12, s11
+; GCN-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, 0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
 entry:
   %stack0 = alloca i32, addrspace(5)
   %ptr0 = load volatile ptr addrspace(5), ptr addrspace(1) poison


