[llvm] [AMDGPU] add s_bitset[10]_b32 optimization for shl+[or, andn2] pattern (PR #134155)

via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 4 17:39:17 PDT 2025


https://github.com/BaoshanPang updated https://github.com/llvm/llvm-project/pull/134155

>From c6dd15301b0f2c6a411dfd0ec68eedf1962eef60 Mon Sep 17 00:00:00 2001
From: Baoshan Pang <bpang at fortinet.com>
Date: Tue, 1 Apr 2025 10:05:08 -0700
Subject: [PATCH 1/2] [AMDGPU] add s_bitset[10]_b32 optimization for
 shl+[or,andn2] pattern

---
 .../Target/AMDGPU/SIShrinkInstructions.cpp    | 55 +++++++++++++-
 llvm/test/CodeGen/AMDGPU/130245.ll            | 23 ++++++
 .../AMDGPU/atomic_optimizations_buffer.ll     | 27 +++----
 .../atomic_optimizations_local_pointer.ll     | 72 +++++++------------
 .../AMDGPU/atomic_optimizations_raw_buffer.ll | 18 ++---
 .../atomic_optimizations_struct_buffer.ll     | 18 ++---
 6 files changed, 121 insertions(+), 92 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/130245.ll

diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 73343e1c80f33..b0b0e46066bc7 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -15,6 +15,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
 
 #define DEBUG_TYPE "si-shrink-instructions"
 
@@ -534,6 +535,56 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
   MachineOperand *SrcReg = Src0;
   MachineOperand *SrcImm = Src1;
 
+  //  case 1:
+  //  From:
+  //  s_lshl_b32 s1, 1, s1
+  //  s_or_b32 s0, s0, s1
+  //  To:
+  //  s_bitset1_b32 s0, s1
+  //
+  //  case 2:
+  //  s_lshl_b32 s1, 1, s1
+  //  s_andn2_b32 s0, s0, s1
+  //  To:
+  //  s_bitset0_b32 s0, s1
+  if ((MI.getOpcode() == AMDGPU::S_OR_B32 ||
+       MI.getOpcode() == AMDGPU::S_ANDN2_B32) &&
+      Src0->isReg() && Src1->isReg() && Dest->getReg() == Src0->getReg()) {
+    MachineInstr *Shl = MRI->getUniqueVRegDef(Src1->getReg());
+    if (Shl && Shl->getOpcode() == AMDGPU::S_LSHL_B32) {
+      MachineOperand &ShlDest = Shl->getOperand(0);
+      MachineOperand &ShlSrc0 = Shl->getOperand(1);
+      MachineOperand &ShlSrc1 = Shl->getOperand(2);
+      const bool IsUndef = SrcReg->isUndef();
+      const bool IsKill = SrcReg->isKill();
+      if (MI.getParent() == Shl->getParent() &&
+          MRI->hasAtMostUserInstrs(ShlDest.getReg(), 2) && ShlSrc0.isImm() &&
+          ShlSrc0.getImm() == 1 && ShlSrc1.isReg()) {
+        unsigned int NewOpc = (MI.getOpcode() == AMDGPU::S_OR_B32)
+                                  ? AMDGPU::S_BITSET1_B32
+                                  : AMDGPU::S_BITSET0_B32;
+        MI.setDesc(TII->get(NewOpc));
+        Src0->setReg(ShlSrc1.getReg());
+        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
+                                          /*isImp*/ false, IsKill,
+                                          /*isDead*/ false, IsUndef);
+        MI.tieOperands(0, 2);
+        for (auto IE = MI.getIterator(), I = std::next(Shl->getIterator());
+             I != IE; ++I) {
+          for (MachineOperand &MO : I->operands()) {
+            if (MO.isReg() && MO.getReg() == ShlSrc1.getReg()) {
+              if (MO.isKill())
+                MO.setIsKill(false);
+            }
+          }
+        }
+        Shl->eraseFromParent();
+        return true;
+      }
+    }
+    return false;
+  }
+
   if (!SrcImm->isImm() ||
       AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
     return false;
@@ -577,8 +628,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
       const bool IsUndef = SrcReg->isUndef();
       const bool IsKill = SrcReg->isKill();
       MI.setDesc(TII->get(Opc));
-      if (Opc == AMDGPU::S_BITSET0_B32 ||
-          Opc == AMDGPU::S_BITSET1_B32) {
+      if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) {
         Src0->ChangeToImmediate(NewImm);
         // Remove the immediate and add the tied input.
         MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
@@ -945,6 +995,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
 
       // Shrink scalar logic operations.
       if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+          MI.getOpcode() == AMDGPU::S_ANDN2_B32 ||
           MI.getOpcode() == AMDGPU::S_OR_B32 ||
           MI.getOpcode() == AMDGPU::S_XOR_B32) {
         if (shrinkScalarLogicOp(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/130245.ll b/llvm/test/CodeGen/AMDGPU/130245.ll
new file mode 100644
index 0000000000000..54e7e0f022f59
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/130245.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+define amdgpu_ps i32 @s_bitset1_b32(i32 inreg %src0, i32 inreg %bit.index) {
+; SI-LABEL: s_bitset1_b32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_bitset1_b32 s0, s1
+; SI-NEXT:    ; return to shader part epilog
+  %set.bit.at.index = shl i32 1, %bit.index
+  %or = or i32 %src0, %set.bit.at.index
+  ret i32 %or
+}
+
+define amdgpu_ps i32 @s_bitset0_b32(i32 inreg %src0, i32 inreg %bit.index) {
+; SI-LABEL: s_bitset0_b32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_bitset0_b32 s0, s1
+; SI-NEXT:    ; return to shader part epilog
+  %set.bit.at.index = shl i32 1, %bit.index
+  %other.bits = xor i32 %set.bit.at.index, -1
+  %and = and i32 %src0, %other.bits
+  ret i32 %and
+}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 8319e112f526e..bada7eb3eba93 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -728,9 +728,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -813,9 +812,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -898,9 +896,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
@@ -1120,9 +1117,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -1211,9 +1207,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -1301,9 +1296,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
@@ -2183,9 +2177,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
@@ -2268,9 +2261,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
@@ -2354,9 +2346,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 9775a37276dfd..15ad3c03a2b9c 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -620,9 +620,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -706,9 +705,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -4027,9 +4025,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
@@ -4113,9 +4110,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
@@ -6436,9 +6432,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_and_b32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
@@ -6522,9 +6517,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_and_b32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
@@ -7068,10 +7062,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
@@ -7166,10 +7159,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
@@ -7797,9 +7789,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_or_b32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
@@ -7883,9 +7874,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_or_b32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
@@ -8428,10 +8418,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
@@ -8526,10 +8515,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
@@ -9157,9 +9145,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
@@ -9243,9 +9230,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
@@ -9788,10 +9774,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
@@ -9886,10 +9871,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
@@ -10517,9 +10501,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_max_i32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
@@ -10603,9 +10586,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_max_i32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
@@ -12345,9 +12327,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_min_i32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
@@ -12431,9 +12412,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_min_i32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
@@ -14173,9 +14153,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_max_u32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
@@ -14259,9 +14238,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_max_u32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
@@ -15988,9 +15966,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_min_u32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
@@ -16074,9 +16051,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_min_u32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 6a82dbeec5e2f..dc7e010e60c2e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -727,9 +727,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -812,9 +811,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -897,9 +895,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
@@ -1777,9 +1774,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB6_1
@@ -1862,9 +1858,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB6_1
@@ -1948,9 +1943,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index dd4c0b0625ea8..f4a5838fc7188 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -747,9 +747,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -834,9 +833,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -920,9 +918,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
@@ -1948,9 +1945,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
@@ -2035,9 +2031,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
@@ -2122,9 +2117,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_bitset0_b32 s1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0

>From 52af4933ae4a02126e19823c1d7c60a3a433413e Mon Sep 17 00:00:00 2001
From: Baoshan Pang <bpang at fortinet.com>
Date: Fri, 4 Apr 2025 15:17:02 -0700
Subject: [PATCH 2/2] fix issues found in code review

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  5 +
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  1 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  2 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  9 ++
 .../Target/AMDGPU/SIShrinkInstructions.cpp    | 55 +----------
 llvm/test/CodeGen/AMDGPU/130245.ll            | 23 -----
 .../atomic_optimizations_global_pointer.ll    | 50 ++++------
 .../atomic_optimizations_local_pointer.ll     | 93 +++++++------------
 llvm/test/CodeGen/AMDGPU/bitset01.ll          | 72 ++++++++++++++
 .../AMDGPU/global_atomics_scan_fadd.ll        | 90 +++++++-----------
 .../AMDGPU/global_atomics_scan_fmax.ll        | 63 +++++--------
 .../AMDGPU/global_atomics_scan_fmin.ll        | 63 +++++--------
 .../AMDGPU/global_atomics_scan_fsub.ll        | 90 +++++++-----------
 .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll    | 28 ++----
 14 files changed, 269 insertions(+), 375 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/130245.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/bitset01.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 536bf0c208752..5024a2104052f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3058,6 +3058,11 @@ bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
   Src = In;
   return true;
 }
+bool AMDGPUDAGToDAGISel::SelectSGPR(SDValue In, SDValue &Src) const {
+  if(In.getNode()->isDivergent()) return false;
+  Src = In;
+  return true;
+}
 
 bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
                                                SDValue &SrcMods,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index f3b9364fdb92b..e4aba9ce3dc8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -202,6 +202,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
                                        SDValue &SrcMods) const;
   bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
+  bool SelectSGPR(SDValue In, SDValue &Src) const;
   bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp, SDValue &Omod) const;
   bool SelectVOP3BMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 23a7f508dcda2..e8a39dc042902 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1563,6 +1563,8 @@ def MFMALdScaleModifierOp : TImmLeaf<i32, [{
 // Complex patterns
 //===----------------------------------------------------------------------===//
 
+def SOP1SGPR : ComplexPattern<untyped, 1, "SelectSGPR">;
+
 def DS1Addr1Offset : ComplexPattern<iPTR, 2, "SelectDS1Addr1Offset">;
 def DS64Bit4ByteAligned : ComplexPattern<iPTR, 3, "SelectDS64Bit4ByteAligned">;
 def DS128Bit8ByteAligned : ComplexPattern<iPTR, 3, "SelectDS128Bit8ByteAligned">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9051db0c01ed1..65d45995e7c60 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1255,6 +1255,15 @@ defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64, VSrcT_b16>;
 let True16Predicate = UseFakeTrue16Insts in
 defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64, VSrc_b16>;
 
+def : GCNPat <
+  (or (i32 (SOP1SGPR i32:$src)), (shl 1, (i32 (SOP1SGPR i32:$bit_idx)))),
+  (S_BITSET1_B32 i32:$bit_idx, i32:$src)
+>;
+
+def : GCNPat <
+  (i32 (and (i32 (SOP1SGPR i32:$src)),(not (shl 1, (i32 (SOP1SGPR i32:$bit_idx)))))),
+  (i32 (S_BITSET0_B32 i32:$bit_idx, i32:$src))
+>;
 //===----------------------------------------------------------------------===//
 // VOP2 Patterns
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index b0b0e46066bc7..73343e1c80f33 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -15,7 +15,6 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineOperand.h"
 
 #define DEBUG_TYPE "si-shrink-instructions"
 
@@ -535,56 +534,6 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
   MachineOperand *SrcReg = Src0;
   MachineOperand *SrcImm = Src1;
 
-  //  case 1:
-  //  From:
-  //  s_lshl_b32 s1, 1, s1
-  //  s_or_b32 s0, s0, s1
-  //  To:
-  //  s_bitset1_b32 s0, s1
-  //
-  //  case 2:
-  //  s_lshl_b32 s1, 1, s1
-  //  s_andn2_b32 s0, s0, s1
-  //  To:
-  //  s_bitset0_b32 s0, s1
-  if ((MI.getOpcode() == AMDGPU::S_OR_B32 ||
-       MI.getOpcode() == AMDGPU::S_ANDN2_B32) &&
-      Src0->isReg() && Src1->isReg() && Dest->getReg() == Src0->getReg()) {
-    MachineInstr *Shl = MRI->getUniqueVRegDef(Src1->getReg());
-    if (Shl && Shl->getOpcode() == AMDGPU::S_LSHL_B32) {
-      MachineOperand &ShlDest = Shl->getOperand(0);
-      MachineOperand &ShlSrc0 = Shl->getOperand(1);
-      MachineOperand &ShlSrc1 = Shl->getOperand(2);
-      const bool IsUndef = SrcReg->isUndef();
-      const bool IsKill = SrcReg->isKill();
-      if (MI.getParent() == Shl->getParent() &&
-          MRI->hasAtMostUserInstrs(ShlDest.getReg(), 2) && ShlSrc0.isImm() &&
-          ShlSrc0.getImm() == 1 && ShlSrc1.isReg()) {
-        unsigned int NewOpc = (MI.getOpcode() == AMDGPU::S_OR_B32)
-                                  ? AMDGPU::S_BITSET1_B32
-                                  : AMDGPU::S_BITSET0_B32;
-        MI.setDesc(TII->get(NewOpc));
-        Src0->setReg(ShlSrc1.getReg());
-        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
-                                          /*isImp*/ false, IsKill,
-                                          /*isDead*/ false, IsUndef);
-        MI.tieOperands(0, 2);
-        for (auto IE = MI.getIterator(), I = std::next(Shl->getIterator());
-             I != IE; ++I) {
-          for (MachineOperand &MO : I->operands()) {
-            if (MO.isReg() && MO.getReg() == ShlSrc1.getReg()) {
-              if (MO.isKill())
-                MO.setIsKill(false);
-            }
-          }
-        }
-        Shl->eraseFromParent();
-        return true;
-      }
-    }
-    return false;
-  }
-
   if (!SrcImm->isImm() ||
       AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
     return false;
@@ -628,7 +577,8 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
       const bool IsUndef = SrcReg->isUndef();
       const bool IsKill = SrcReg->isKill();
       MI.setDesc(TII->get(Opc));
-      if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) {
+      if (Opc == AMDGPU::S_BITSET0_B32 ||
+          Opc == AMDGPU::S_BITSET1_B32) {
         Src0->ChangeToImmediate(NewImm);
         // Remove the immediate and add the tied input.
         MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
@@ -995,7 +945,6 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
 
       // Shrink scalar logic operations.
       if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
-          MI.getOpcode() == AMDGPU::S_ANDN2_B32 ||
           MI.getOpcode() == AMDGPU::S_OR_B32 ||
           MI.getOpcode() == AMDGPU::S_XOR_B32) {
         if (shrinkScalarLogicOp(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/130245.ll b/llvm/test/CodeGen/AMDGPU/130245.ll
deleted file mode 100644
index 54e7e0f022f59..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/130245.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-define amdgpu_ps i32 @s_bitset1_b32(i32 inreg %src0, i32 inreg %bit.index) {
-; SI-LABEL: s_bitset1_b32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_bitset1_b32 s0, s1
-; SI-NEXT:    ; return to shader part epilog
-  %set.bit.at.index = shl i32 1, %bit.index
-  %or = or i32 %src0, %set.bit.at.index
-  ret i32 %or
-}
-
-define amdgpu_ps i32 @s_bitset0_b32(i32 inreg %src0, i32 inreg %bit.index) {
-; SI-LABEL: s_bitset0_b32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_bitset0_b32 s0, s1
-; SI-NEXT:    ; return to shader part epilog
-  %set.bit.at.index = shl i32 1, %bit.index
-  %other.bits = xor i32 %set.bit.at.index, -1
-  %and = and i32 %src0, %other.bits
-  ret i32 %and
-}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 62083b3e67ab6..5a48c35a29226 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -845,9 +845,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -941,9 +940,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -1036,9 +1034,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1232_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s2
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
@@ -2509,12 +2506,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2620,13 +2616,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2729,13 +2723,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
 ; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -4380,9 +4373,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
@@ -4476,9 +4468,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
@@ -4571,9 +4562,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1232_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s2
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
@@ -6075,12 +6065,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6186,13 +6175,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6295,13 +6282,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
 ; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 15ad3c03a2b9c..6259f9a7da982 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1174,8 +1174,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -1237,9 +1236,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -2157,12 +2154,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2258,13 +2254,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2972,10 +2966,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v1, s3
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s4
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3042,11 +3035,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s5, v1, s3
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s4
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4579,8 +4570,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
@@ -4642,9 +4632,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
@@ -5587,12 +5575,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5688,13 +5675,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -11400,12 +11385,11 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_gt_i64_e64 s8, s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_and_b32 s3, s8, exec_lo
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -11506,18 +11490,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_i64_e64 s8, s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX1132_ITERATIVE-NEXT:    s_and_b32 s3, s8, exec_lo
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -13226,12 +13208,11 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_lt_i64_e64 s8, s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_and_b32 s3, s8, exec_lo
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -13332,18 +13313,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_i64_e64 s8, s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX1132_ITERATIVE-NEXT:    s_and_b32 s3, s8, exec_lo
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -15043,12 +15022,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_gt_u64_e64 s8, s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_and_b32 s3, s8, exec_lo
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -15153,14 +15131,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_u64_e64 s8, s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX1132_ITERATIVE-NEXT:    s_and_b32 s3, s8, exec_lo
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -16857,12 +16832,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_lt_u64_e64 s8, s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_and_b32 s3, s8, exec_lo
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -16967,14 +16941,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_u64_e64 s8, s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX1132_ITERATIVE-NEXT:    s_and_b32 s3, s8, exec_lo
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/bitset01.ll b/llvm/test/CodeGen/AMDGPU/bitset01.ll
new file mode 100644
index 0000000000000..9ef91aef79270
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bitset01.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; https://github.com/llvm/llvm-project/issues/130245
+
+define amdgpu_ps i32 @s_bitset1_b32(i32 inreg %src0, i32 inreg %bit.index) {
+; SI-LABEL: s_bitset1_b32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_bitset1_b32 s0, s1
+; SI-NEXT:    ; return to shader part epilog
+  %set.bit.at.index = shl i32 1, %bit.index
+  %or = or i32 %src0, %set.bit.at.index
+  ret i32 %or
+}
+
+define amdgpu_ps i32 @s_bitset1_b32_no_opt(i32 inreg %src0, i32 inreg %bit.index) {
+; SI-LABEL: s_bitset1_b32_no_opt:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_lshl_b32 s1, 2, s1
+; SI-NEXT:    s_or_b32 s0, s0, s1
+; SI-NEXT:    ; return to shader part epilog
+  %set.bit.at.index = shl i32 2, %bit.index
+  %or = or i32 %src0, %set.bit.at.index
+  ret i32 %or
+}
+
+define amdgpu_ps float @s_bitset1_b32_v(i32 %src0, i32 %bit.index) {
+; SI-LABEL: s_bitset1_b32_v:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_lshl_or_b32 v0, 1, v1, v0
+; SI-NEXT:    ; return to shader part epilog
+  %set.bit.at.index = shl i32 1, %bit.index
+  %or = or i32 %src0, %set.bit.at.index
+  %bc = bitcast i32 %or to float
+  ret float %bc
+}
+
+define amdgpu_ps i32 @s_bitset0_b32(i32 inreg %src0, i32 inreg %bit.index) {
+; SI-LABEL: s_bitset0_b32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_bitset0_b32 s0, s1
+; SI-NEXT:    ; return to shader part epilog
+  %set.bit.at.index = shl i32 1, %bit.index
+  %other.bits = xor i32 %set.bit.at.index, -1
+  %and = and i32 %src0, %other.bits
+  ret i32 %and
+}
+
+define amdgpu_ps i32 @s_bitset0_b32_no_opt(i32 inreg %src0, i32 inreg %bit.index) {
+; SI-LABEL: s_bitset0_b32_no_opt:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_lshl_b32 s1, 10, s1
+; SI-NEXT:    s_andn2_b32 s0, s0, s1
+; SI-NEXT:    ; return to shader part epilog
+  %set.bit.at.index = shl i32 10, %bit.index
+  %other.bits = xor i32 %set.bit.at.index, -1
+  %and = and i32 %src0, %other.bits
+  ret i32 %and
+}
+
+define amdgpu_ps float @s_bitset0_b32_v(i32 %src0, i32 %bit.index) {
+; SI-LABEL: s_bitset0_b32_v:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
+; SI-NEXT:    v_not_b32_e32 v1, v1
+; SI-NEXT:    v_and_b32_e32 v0, v0, v1
+; SI-NEXT:    ; return to shader part epilog
+  %set.bit.at.index = shl i32 1, %bit.index
+  %other.bits = xor i32 %set.bit.at.index, -1
+  %and = and i32 %src0, %other.bits
+  %bc = bitcast i32 %and to float
+  ret float %bc
+}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 9d5b324b271cb..11ad8261e125f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -604,8 +604,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -709,9 +708,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -1822,8 +1820,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -1927,9 +1924,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -3100,8 +3096,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -3205,9 +3200,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -3874,8 +3868,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB6_1
@@ -3979,9 +3972,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB6_1
@@ -5151,8 +5143,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB8_1
@@ -5269,9 +5260,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB8_1
@@ -7021,10 +7011,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7215,11 +7204,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8750,10 +8738,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8875,11 +8862,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10183,10 +10169,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10308,11 +10293,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11098,10 +11082,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11223,11 +11206,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13077,10 +13059,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13271,11 +13252,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index aad38411d12e6..a636771f2e65e 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -511,8 +511,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
@@ -609,12 +608,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1550,8 +1547,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
@@ -1648,12 +1644,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2589,8 +2583,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
@@ -2687,12 +2680,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4394,10 +4385,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4596,11 +4586,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5999,10 +5988,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6118,11 +6106,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -7940,10 +7927,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -8142,11 +8128,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 478daf363820f..3585b27cbd4a0 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -511,8 +511,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
@@ -609,12 +608,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1550,8 +1547,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
@@ -1648,12 +1644,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2589,8 +2583,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
@@ -2687,12 +2680,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4394,10 +4385,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4596,11 +4586,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5999,10 +5988,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6118,11 +6106,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -7940,10 +7927,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -8142,11 +8128,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 66f8c2de30530..21b61415e474c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -664,8 +664,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -782,9 +781,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -1994,8 +1992,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -2112,9 +2109,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -3324,8 +3320,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -3442,9 +3437,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -4150,8 +4144,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB6_1
@@ -4268,9 +4261,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB6_1
@@ -5479,8 +5471,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB8_1
@@ -5597,9 +5588,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB8_1
@@ -7349,10 +7339,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7543,11 +7532,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9077,10 +9065,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9202,11 +9189,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10510,10 +10496,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10635,11 +10620,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11425,10 +11409,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11550,11 +11533,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13403,10 +13385,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
+; GFX1032-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13597,11 +13578,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_bitset0_b32 s0, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    v_add_f64 v[41:42], v[41:42], s[2:3]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index a6f8be8b7559d..bc52547eb4e8c 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -7060,10 +7060,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_readlane_b32 s6, v1, s3
-; GFX12-NEXT:    s_lshl_b32 s7, 1, s3
 ; GFX12-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s1, s1, s7
+; GFX12-NEXT:    s_bitset0_b32 s1, s3
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX12-NEXT:    s_add_f32 s0, s0, s6
@@ -7237,12 +7235,11 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX11-NEXT:    v_readlane_b32 s6, v2, s1
-; GFX11-NEXT:    s_lshl_b32 s7, 1, s1
+; GFX11-NEXT:    s_bitset0_b32 s0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 s0, s0, s7
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v0, s3, s1
 ; GFX11-NEXT:    v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB28_5
 ; GFX11-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7322,11 +7319,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX10-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX10-NEXT:    v_readlane_b32 s6, v2, s1
-; GFX10-NEXT:    s_lshl_b32 s7, 1, s1
-; GFX10-NEXT:    s_andn2_b32 s0, s0, s7
+; GFX10-NEXT:    s_bitset0_b32 s0, s1
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    v_writelane_b32 v0, s3, s1
 ; GFX10-NEXT:    v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB28_5
 ; GFX10-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7924,10 +7920,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_readlane_b32 s6, v1, s3
-; GFX12-NEXT:    s_lshl_b32 s7, 1, s3
 ; GFX12-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s1, s1, s7
+; GFX12-NEXT:    s_bitset0_b32 s1, s3
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX12-NEXT:    s_add_f32 s0, s0, s6
@@ -8093,12 +8087,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX11-NEXT:    v_readlane_b32 s6, v2, s1
-; GFX11-NEXT:    s_lshl_b32 s7, 1, s1
+; GFX11-NEXT:    s_bitset0_b32 s0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 s0, s0, s7
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v0, s3, s1
 ; GFX11-NEXT:    v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB29_5
 ; GFX11-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8171,11 +8164,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX10-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX10-NEXT:    v_readlane_b32 s6, v2, s1
-; GFX10-NEXT:    s_lshl_b32 s7, 1, s1
-; GFX10-NEXT:    s_andn2_b32 s0, s0, s7
+; GFX10-NEXT:    s_bitset0_b32 s0, s1
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    v_writelane_b32 v0, s3, s1
 ; GFX10-NEXT:    v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB29_5
 ; GFX10-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0



More information about the llvm-commits mailing list