[llvm] [AMDGPU][CodeGen] Implement SimplifyDemandedBitsForTargetNode for readfirstlane. (PR #190009)

via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 2 11:59:29 PDT 2026


https://github.com/lakreite updated https://github.com/llvm/llvm-project/pull/190009

>From 27f5c06fcdf8e7d643647b5174857dd6272bb26d Mon Sep 17 00:00:00 2001
From: lakreite <lakreite at gmail.com>
Date: Wed, 1 Apr 2026 19:26:44 +0300
Subject: [PATCH 1/4] [AMDGPU] Implement SimplifyDemandedBitsForTargetNode for
 readfirstlane intrinsic

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  25 ++
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   8 +
 llvm/test/CodeGen/AMDGPU/always-uniform.ll    |   7 +-
 ...pu-simplify-demanded-bits-readfirstlane.ll |  47 ++++
 .../atomic_optimizations_global_pointer.ll    | 264 +++++++-----------
 .../CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll     |  18 +-
 6 files changed, 184 insertions(+), 185 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e7c5b2cb5cc48..53559ff4d28d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5806,6 +5806,31 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
+bool AMDGPUTargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &OriginalDemandedBits,
+    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+    unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  case ISD::INTRINSIC_WO_CHAIN:
+    switch (Op.getConstantOperandVal(0)) {
+    case Intrinsic::amdgcn_readfirstlane:
+      if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
+                               OriginalDemandedElts, Known, TLO, Depth + 1))
+        return true;
+      break;
+
+    default:
+      break;
+    }
+    break;
+
+  default:
+    break;
+  }
+
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 18fed2ebe6e6b..ab48e80df2961 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -277,6 +277,14 @@ class AMDGPUTargetLowering : public TargetLowering {
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+  bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+                                           const APInt &OriginalDemandedBits,
+                                           const APInt &OriginalDemandedElts,
+                                           KnownBits &Known,
+                                           TargetLoweringOpt &TLO,
+                                           unsigned Depth) const override;
+
   void ReplaceNodeResults(SDNode * N,
                           SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index 689b306518c9b..44dbc0442b68c 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -10,13 +10,10 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    s_lshl_b32 s4, s4, 2
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_u32 s0, s0, s4
-; GCN-NEXT:    s_addc_u32 s1, s1, s5
-; GCN-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GCN-NEXT:    s_load_dword s4, s[0:1], s4
 ; GCN-NEXT:    s_add_u32 s0, s2, 40
 ; GCN-NEXT:    s_addc_u32 s1, s3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll
new file mode 100644
index 0000000000000..96c1343d62a28
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=GCN %s
+
+define void @readfirstlane_demanded_i8_zext_store(i8 %src, ptr addrspace(1) %ptr) {
+; GCN-LABEL: readfirstlane_demanded_i8_zext_store:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_readfirstlane_b32 s4, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    global_store_byte v[1:2], v0, off
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %zext = zext i8 %src to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %zext)
+  %trunc = trunc i32 %readfirstlane to i8
+  store i8 %trunc, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @readfirstlane_demanded_i8_sext_store(i8 %src, ptr addrspace(1) %ptr) {
+; GCN-LABEL: readfirstlane_demanded_i8_sext_store:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_readfirstlane_b32 s4, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    global_store_byte v[1:2], v0, off
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %zext = sext i8 %src to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %zext)
+  %trunc = trunc i32 %readfirstlane to i8
+  store i8 %trunc, ptr addrspace(1) %ptr
+  ret void
+}
+
+define i16 @readfirstlane_demanded_i16_sext(i16 %src) {
+; GCN-LABEL: readfirstlane_demanded_i16_sext:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_readfirstlane_b32 s4, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %zext = sext i16 %src to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %zext)
+  %trunc = trunc i32 %readfirstlane to i16
+  ret i16 %trunc
+}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index fbc8b812d96c9..dfd56b0917486 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -8159,10 +8159,9 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s10, -1
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s12
+; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX7LESS-NEXT:    buffer_store_byte v0, off, s[8:11], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -8208,7 +8207,6 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX8-NEXT:  .LBB12_4: ; %Flow
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s12
@@ -8260,7 +8258,6 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX9-NEXT:  .LBB12_4: ; %Flow
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s12
@@ -8313,12 +8310,11 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1064-NEXT:  .LBB12_4: ; %Flow
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1064-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1064-NEXT:    buffer_store_byte v0, off, s[8:11], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -8364,12 +8360,11 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1032-NEXT:  .LBB12_4: ; %Flow
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1032-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1032-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1032-NEXT:    buffer_store_byte v0, off, s[8:11], 0
 ; GFX1032-NEXT:    s_endpgm
@@ -8419,14 +8414,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1164-TRUE16-NEXT:  .LBB12_4: ; %Flow
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_cndmask_b16 v0.l, s12, 0, vcc
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s10, -1
-; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1164-TRUE16-NEXT:    v_cndmask_b16 v0.l, s12, 0, vcc
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-TRUE16-NEXT:    v_or_b16 v0.l, s0, v0.l
 ; GFX1164-TRUE16-NEXT:    buffer_store_b8 v0, off, s[8:11], 0
 ; GFX1164-TRUE16-NEXT:    s_endpgm
@@ -8476,14 +8469,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1164-FAKE16-NEXT:  .LBB12_4: ; %Flow
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1164-FAKE16-NEXT:    buffer_store_b8 v0, off, s[8:11], 0
 ; GFX1164-FAKE16-NEXT:    s_endpgm
@@ -8531,14 +8522,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1132-TRUE16-NEXT:  .LBB12_4: ; %Flow
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, 0, vcc_lo
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s10, -1
-; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1132-TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, 0, vcc_lo
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-TRUE16-NEXT:    v_or_b16 v0.l, s0, v0.l
 ; GFX1132-TRUE16-NEXT:    buffer_store_b8 v0, off, s[8:11], 0
 ; GFX1132-TRUE16-NEXT:    s_endpgm
@@ -8586,14 +8575,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1132-FAKE16-NEXT:  .LBB12_4: ; %Flow
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1132-FAKE16-NEXT:    buffer_store_b8 v0, off, s[8:11], 0
 ; GFX1132-FAKE16-NEXT:    s_endpgm
@@ -8643,13 +8630,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1264-TRUE16-NEXT:  .LBB12_4: ; %Flow
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_cndmask_b16 v0.l, s12, 0, vcc
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s10, -1
-; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1264-TRUE16-NEXT:    v_cndmask_b16 v0.l, s12, 0, vcc
 ; GFX1264-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-TRUE16-NEXT:    v_or_b16 v0.l, s0, v0.l
@@ -8701,13 +8687,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1264-FAKE16-NEXT:  .LBB12_4: ; %Flow
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1264-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -8757,13 +8742,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1232-TRUE16-NEXT:  .LBB12_4: ; %Flow
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, 0, vcc_lo
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s10, -1
-; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1232-TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, 0, vcc_lo
 ; GFX1232-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-TRUE16-NEXT:    v_or_b16 v0.l, s0, v0.l
@@ -8813,13 +8797,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1232-FAKE16-NEXT:  .LBB12_4: ; %Flow
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1232-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -8884,7 +8867,6 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX7LESS-NEXT:    s_and_b32 s5, s10, 0xff
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
@@ -8941,7 +8923,6 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX8-NEXT:  .LBB13_4: ; %Flow
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8999,7 +8980,6 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX9-NEXT:  .LBB13_4: ; %Flow
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9058,10 +9038,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1064-NEXT:  .LBB13_4: ; %Flow
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_byte v0, off, s[0:3], 0
@@ -9115,10 +9094,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1032-NEXT:  .LBB13_4: ; %Flow
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1032-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_byte v0, off, s[0:3], 0
@@ -9178,12 +9156,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1164-TRUE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
@@ -9243,12 +9219,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1164-FAKE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
@@ -9305,12 +9279,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-TRUE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
@@ -9367,12 +9339,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-FAKE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
@@ -9432,13 +9402,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1264-TRUE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
@@ -9498,13 +9466,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1264-FAKE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
@@ -9564,13 +9530,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-TRUE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
@@ -9630,13 +9594,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-FAKE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
@@ -10064,10 +10026,9 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s10, -1
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s12
+; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX7LESS-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -10113,7 +10074,6 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX8-NEXT:  .LBB15_4: ; %Flow
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s12
@@ -10165,7 +10125,6 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX9-NEXT:  .LBB15_4: ; %Flow
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s12
@@ -10218,12 +10177,11 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1064-NEXT:  .LBB15_4: ; %Flow
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1064-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1064-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -10269,12 +10227,11 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1032-NEXT:  .LBB15_4: ; %Flow
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1032-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1032-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; GFX1032-NEXT:    s_endpgm
@@ -10324,13 +10281,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1164-TRUE16-NEXT:  .LBB15_4: ; %Flow
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_cndmask_b16 v0.l, s12, 0, vcc
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s10, -1
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1164-TRUE16-NEXT:    v_cndmask_b16 v0.l, s12, 0, vcc
 ; GFX1164-TRUE16-NEXT:    v_or_b16 v0.l, s0, v0.l
 ; GFX1164-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX1164-TRUE16-NEXT:    s_endpgm
@@ -10380,14 +10336,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1164-FAKE16-NEXT:  .LBB15_4: ; %Flow
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1164-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX1164-FAKE16-NEXT:    s_endpgm
@@ -10435,13 +10389,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1132-TRUE16-NEXT:  .LBB15_4: ; %Flow
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, 0, vcc_lo
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s10, -1
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1132-TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, 0, vcc_lo
 ; GFX1132-TRUE16-NEXT:    v_or_b16 v0.l, s0, v0.l
 ; GFX1132-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX1132-TRUE16-NEXT:    s_endpgm
@@ -10489,14 +10442,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1132-FAKE16-NEXT:  .LBB15_4: ; %Flow
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1132-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX1132-FAKE16-NEXT:    s_endpgm
@@ -10546,14 +10497,14 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1264-TRUE16-NEXT:  .LBB15_4: ; %Flow
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_cndmask_b16 v0.l, s12, 0, vcc
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s10, -1
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1264-TRUE16-NEXT:    v_cndmask_b16 v0.l, s12, 0, vcc
 ; GFX1264-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-TRUE16-NEXT:    v_or_b16 v0.l, s0, v0.l
 ; GFX1264-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
 ; GFX1264-TRUE16-NEXT:    s_endpgm
@@ -10603,13 +10554,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s13, v2
 ; GFX1264-FAKE16-NEXT:  .LBB15_4: ; %Flow
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s12, 0, vcc
 ; GFX1264-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -10659,14 +10609,14 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1232-TRUE16-NEXT:  .LBB15_4: ; %Flow
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, 0, vcc_lo
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s10, -1
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1232-TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, 0, vcc_lo
 ; GFX1232-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-TRUE16-NEXT:    v_or_b16 v0.l, s0, v0.l
 ; GFX1232-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
 ; GFX1232-TRUE16-NEXT:    s_endpgm
@@ -10714,13 +10664,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX1232-FAKE16-NEXT:  .LBB15_4: ; %Flow
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s1, 0, vcc_lo
 ; GFX1232-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -10785,7 +10734,6 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX7LESS-NEXT:    s_and_b32 s5, s10, 0xffff
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
@@ -10842,7 +10790,6 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX8-NEXT:  .LBB16_4: ; %Flow
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10900,7 +10847,6 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX9-NEXT:  .LBB16_4: ; %Flow
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10959,10 +10905,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1064-NEXT:  .LBB16_4: ; %Flow
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -11016,10 +10961,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1032-NEXT:  .LBB16_4: ; %Flow
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1032-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -11079,11 +11023,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1164-TRUE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -11143,12 +11086,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1164-FAKE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -11205,11 +11146,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-TRUE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -11266,12 +11206,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-FAKE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -11331,11 +11269,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1264-TRUE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX1264-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
@@ -11396,13 +11333,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1264-FAKE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -11462,11 +11397,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-TRUE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX1232-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
@@ -11527,13 +11461,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-FAKE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-FAKE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
index 0290ffdf161cf..888da732520e9 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
@@ -4,24 +4,14 @@
 define amdgpu_gs i32 @main() {
 ; CHECK-LABEL: main:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_bitcmp1_b32 0, 0
 ; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
-; CHECK-NEXT:    s_or_saveexec_b32 s2, -1
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    v_readfirstlane_b32 s1, v0
-; CHECK-NEXT:    s_mov_b32 exec_lo, s2
-; CHECK-NEXT:    s_or_b32 s0, s0, s1
-; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
 ; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
-; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; CHECK-NEXT:    s_xor_b32 s0, s0, -1
-; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v1
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
 ; CHECK-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; CHECK-NEXT:    ; return to shader part epilog
 bb:

>From 53cec0261194e11efe2ddd622652de97130c955a Mon Sep 17 00:00:00 2001
From: lakreite <lakreite at gmail.com>
Date: Wed, 1 Apr 2026 19:26:44 +0300
Subject: [PATCH 2/4] [AMDGPU] Implement SimplifyDemandedBitsForTargetNode for
 readfirstlane intrinsic

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   | 10 ++++-----
 ...pu-simplify-demanded-bits-readfirstlane.ll | 21 +++++++++++++++----
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index ab48e80df2961..5c0828bf4d9e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -279,11 +279,11 @@ class AMDGPUTargetLowering : public TargetLowering {
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
   bool SimplifyDemandedBitsForTargetNode(SDValue Op,
-                                           const APInt &OriginalDemandedBits,
-                                           const APInt &OriginalDemandedElts,
-                                           KnownBits &Known,
-                                           TargetLoweringOpt &TLO,
-                                           unsigned Depth) const override;
+                                         const APInt &OriginalDemandedBits,
+                                         const APInt &OriginalDemandedElts,
+                                         KnownBits &Known,
+                                         TargetLoweringOpt &TLO,
+                                         unsigned Depth) const override;
 
   void ReplaceNodeResults(SDNode * N,
                           SmallVectorImpl<SDValue> &Results,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll
index 96c1343d62a28..3fc9fa8a7b4c4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll
@@ -26,13 +26,26 @@ define void @readfirstlane_demanded_i8_sext_store(i8 %src, ptr addrspace(1) %ptr
 ; GCN-NEXT:    global_store_byte v[1:2], v0, off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %zext = sext i8 %src to i32
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %zext)
+  %sext = sext i8 %src to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sext)
   %trunc = trunc i32 %readfirstlane to i8
   store i8 %trunc, ptr addrspace(1) %ptr
   ret void
 }
 
+define i16 @readfirstlane_demanded_i16_zext(i16 %src) {
+; GCN-LABEL: readfirstlane_demanded_i16_zext:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_readfirstlane_b32 s4, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %zext = zext i16 %src to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %zext)
+  %trunc = trunc i32 %readfirstlane to i16
+  ret i16 %trunc
+}
+
 define i16 @readfirstlane_demanded_i16_sext(i16 %src) {
 ; GCN-LABEL: readfirstlane_demanded_i16_sext:
 ; GCN:       ; %bb.0:
@@ -40,8 +53,8 @@ define i16 @readfirstlane_demanded_i16_sext(i16 %src) {
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %zext = sext i16 %src to i32
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %zext)
+  %sext = sext i16 %src to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sext)
   %trunc = trunc i32 %readfirstlane to i16
   ret i16 %trunc
 }

>From 6e12fac2d0909640127a0d099a1b1898a4c8cac8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <arsenm2 at gmail.com>
Date: Thu, 2 Apr 2026 14:47:52 +0200
Subject: [PATCH 3/4] Update
 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll

---
 .../AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll
index 3fc9fa8a7b4c4..05832e2387747 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
 
 define void @readfirstlane_demanded_i8_zext_store(i8 %src, ptr addrspace(1) %ptr) {
 ; GCN-LABEL: readfirstlane_demanded_i8_zext_store:

>From b2296cc1d074da5702fe833350074022487ddd41 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <arsenm2 at gmail.com>
Date: Thu, 2 Apr 2026 20:30:55 +0200
Subject: [PATCH 4/4] braces

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 53559ff4d28d0..a6a5521989447 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5813,7 +5813,7 @@ bool AMDGPUTargetLowering::SimplifyDemandedBitsForTargetNode(
   switch (Op.getOpcode()) {
   case ISD::INTRINSIC_WO_CHAIN:
     switch (Op.getConstantOperandVal(0)) {
-    case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readfirstlane: {
       if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
                                OriginalDemandedElts, Known, TLO, Depth + 1))
         return true;
@@ -5821,9 +5821,8 @@ bool AMDGPUTargetLowering::SimplifyDemandedBitsForTargetNode(
 
     default:
       break;
+    } break;
     }
-    break;
-
   default:
     break;
   }



More information about the llvm-commits mailing list