[llvm] [AMDGPU] Fold uniform readfirstlane + cndmask (PR #70188)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 25 03:45:48 PDT 2023


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/70188

From c128818484d343a07b760ab8ba74173de40b059d Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 25 Oct 2023 11:58:07 +0200
Subject: [PATCH 1/3] [AMDGPU] Fold uniform readfirstlane + cndmask

(Alternative patch for #69703)

Teach SIFoldOperands to fold the s/zext DAGISel pattern that always emits a CNDMASK + READFIRSTLANE pair, even for uniform comparisons.

Fixes #59869
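
For illustration, a sketch of the kind of IR that gets selected into this
pattern (function and value names here are illustrative, not taken from the
linked issues):

  define amdgpu_kernel void @zext_uniform_cmp(ptr addrspace(1) %out, i32 %a, i32 %b) {
    %cmp = icmp eq i32 %a, %b   ; uniform comparison of kernel arguments
    %ext = zext i1 %cmp to i32  ; selected as v_cndmask + v_readfirstlane
    store i32 %ext, ptr addrspace(1) %out
    ret void
  }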
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |  88 +++++++
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  77 +++---
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           | 133 +++++-----
 .../si-fold-readfirstlane-cndmask-w32.mir     | 241 ++++++++++++++++++
 .../si-fold-readfirstlane-cndmask-w64.mir     | 241 ++++++++++++++++++
 5 files changed, 670 insertions(+), 110 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w32.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w64.mir

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1ebfa297f4fc339..d4c652eda715b80 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -104,6 +104,7 @@ class SIFoldOperands : public MachineFunctionPass {
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
   bool tryFoldFoldableCopy(MachineInstr &MI,
                            MachineOperand *&CurrentKnownM0Val) const;
+  bool tryFoldUniformReadFirstLaneCndMask(MachineInstr &MI) const;
 
   const MachineOperand *isClamp(const MachineInstr &MI) const;
   bool tryFoldClamp(MachineInstr &MI);
@@ -1400,6 +1401,88 @@ bool SIFoldOperands::tryFoldFoldableCopy(
   return Changed;
 }
 
+// Try to fold the following pattern:
+//    s_cselect s[2:3], K, 0          ; K has LSB set. Usually it's +-1.
+//    v_cndmask v0, 0, +-1, s[2:3]
+//    v_readfirstlane s0, v0
+//
+// into (for example)
+//
+//    s_cselect s[2:3], K, 0
+//    s_bfe_u32 s0, s2, 0x10000
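+//
+// For the signed variant (v_cndmask selecting -1 instead of 1), S_BFE_I32 is
+// emitted instead so the extracted bit is sign-extended.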
+bool SIFoldOperands::tryFoldUniformReadFirstLaneCndMask(
+    MachineInstr &MI) const {
+  if (MI.getOpcode() != AMDGPU::V_READFIRSTLANE_B32)
+    return false;
+
+  MachineInstr *RFLSrc = MRI->getVRegDef(MI.getOperand(1).getReg());
+  // We can also have the following pattern:
+  //
+  // %2:vreg_64 = REG_SEQUENCE %X:vgpr_32, sub0, %Y:vgpr_32, sub1
+  // %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64
+  //
+  // In this case we dig into %X or %Y depending on which subregister
+  // the V_READFIRSTLANE accesses.
+  if (RFLSrc->isRegSequence()) {
+    unsigned RFLSubReg = MI.getOperand(1).getSubReg();
+    if (RFLSrc->getNumOperands() != 5)
+      return false;
+
+    if (RFLSrc->getOperand(2).getImm() == RFLSubReg)
+      RFLSrc = MRI->getVRegDef(RFLSrc->getOperand(1).getReg());
+    else if (RFLSrc->getOperand(4).getImm() == RFLSubReg)
+      RFLSrc = MRI->getVRegDef(RFLSrc->getOperand(3).getReg());
+    else
+      return false;
+  }
+
+  // Need the e64 form so the condition mask can be an arbitrary SGPR
+  // (the e32 form implicitly uses VCC).
+  if (!RFLSrc || RFLSrc->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+    return false;
+
+  MachineOperand *Src0 = TII->getNamedOperand(*RFLSrc, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(*RFLSrc, AMDGPU::OpName::src1);
+  Register Src2 = TII->getNamedOperand(*RFLSrc, AMDGPU::OpName::src2)->getReg();
+
+  if (!Src0->isImm() || Src0->getImm() != 0 || !Src1->isImm())
+    return false;
+
+  // This pattern usually comes from an ext: sext uses -1, zext uses 1.
+  bool IsSigned = false;
+  if (Src1->getImm() == -1)
+    IsSigned = true;
+  else if (Src1->getImm() != 1)
+    return false;
+
+  MachineInstr *CSel = MRI->getVRegDef(Src2);
+  if (!CSel || (CSel->getOpcode() != AMDGPU::S_CSELECT_B32 &&
+                CSel->getOpcode() != AMDGPU::S_CSELECT_B64))
+    return false;
+
+  MachineOperand *CSelSrc0 = TII->getNamedOperand(*CSel, AMDGPU::OpName::src0);
+  MachineOperand *CSelSrc1 = TII->getNamedOperand(*CSel, AMDGPU::OpName::src1);
+  // Note: we could also allow any non-zero value for CSelSrc0, and adapt the
+  // BFE's mask depending on where the first set bit is.
+  if (!CSelSrc0->isImm() || (CSelSrc0->getImm() & 1) == 0 ||
+      !CSelSrc1->isImm() || CSelSrc1->getImm() != 0)
+    return false;
+
+  // Replace the V_CNDMASK with S_BFE.
+  unsigned BFEOpc = (IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32);
+
+  // If the CSELECT writes to a 64-bit SGPR pair, only read the low 32 bits.
+  unsigned SubReg = 0;
+  if (CSel->getOpcode() == AMDGPU::S_CSELECT_B64)
+    SubReg = AMDGPU::sub0;
+
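+  // S_BFE's src1 immediate encodes the field as (width << 16) | offset, so
+  // 0x10000 extracts the single bit at offset 0.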
+  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(BFEOpc),
+          MI.getOperand(0).getReg())
+      .addReg(Src2, /*Flags*/ 0, SubReg)
+      .addImm(0x10000);
+  MI.eraseFromParent();
+  return true;
+}
+
 // Clamp patterns are canonically selected to v_max_* instructions, so only
 // handle them.
 const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
@@ -2087,6 +2170,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
+      if (tryFoldUniformReadFirstLaneCndMask(MI)) {
+        Changed = true;
+        continue;
+      }
+
       // Saw an unknown clobber of m0, so we no longer know what it is.
       if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
         CurrentKnownM0Val = nullptr;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 667c561ea26f6f6..52e356a565a5b48 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1536,8 +1536,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; SI-NEXT:    s_or_b32 s2, s5, s2
 ; SI-NEXT:    s_cmp_lg_u32 s2, 0
 ; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; SI-NEXT:    v_readfirstlane_b32 s2, v1
+; SI-NEXT:    s_bfe_u32 s2, s4, 0x10000
 ; SI-NEXT:    s_bfe_u32 s5, s3, 0xb0014
 ; SI-NEXT:    s_or_b32 s2, s6, s2
 ; SI-NEXT:    s_sub_i32 s6, 0x3f1, s5
@@ -1599,8 +1598,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; VI-NEXT:    s_or_b32 s0, s1, s6
 ; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT:    v_readfirstlane_b32 s0, v2
+; VI-NEXT:    s_bfe_u32 s0, s0, 0x10000
 ; VI-NEXT:    s_bfe_u32 s1, s7, 0xb0014
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_or_b32 s4, s2, s0
@@ -1661,8 +1659,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX9-NEXT:    s_or_b32 s0, s1, s6
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x10000
 ; GFX9-NEXT:    s_bfe_u32 s1, s7, 0xb0014
 ; GFX9-NEXT:    s_or_b32 s6, s2, s0
 ; GFX9-NEXT:    s_sub_i32 s2, 0x3f1, s1
@@ -1714,6 +1711,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s7, 0x1ff
 ; GFX11-NEXT:    s_lshr_b32 s2, s7, 8
@@ -1721,48 +1719,45 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xffe
 ; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX11-NEXT:    s_bfe_u32 s1, s7, 0xb0014
-; GFX11-NEXT:    s_sub_i32 s3, 0x3f1, s1
-; GFX11-NEXT:    s_addk_i32 s1, 0xfc10
-; GFX11-NEXT:    v_med3_i32 v1, s3, 0, 13
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX11-NEXT:    s_lshl_b32 s8, s1, 12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v1
-; GFX11-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    s_or_b32 s3, s2, 0x1000
-; GFX11-NEXT:    s_or_b32 s8, s2, s8
-; GFX11-NEXT:    s_lshr_b32 s6, s3, s6
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v1, s6
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s3, v0
+; GFX11-NEXT:    s_bfe_u32 s3, s7, 0xb0014
+; GFX11-NEXT:    s_bfe_u32 s1, s1, 0x10000
+; GFX11-NEXT:    s_sub_i32 s6, 0x3f1, s3
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    v_med3_i32 v0, s6, 0, 13
+; GFX11-NEXT:    s_or_b32 s2, s1, 0x1000
+; GFX11-NEXT:    s_addk_i32 s3, 0xfc10
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 12
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX11-NEXT:    s_or_b32 s8, s1, s8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshr_b32 s6, s2, s6
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s2, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX11-NEXT:    s_or_b32 s3, s6, s3
-; GFX11-NEXT:    s_cmp_lt_i32 s1, 1
-; GFX11-NEXT:    s_cselect_b32 s3, s3, s8
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s6, s3, 7
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s6, s2
+; GFX11-NEXT:    s_cmp_lt_i32 s3, 1
+; GFX11-NEXT:    s_cselect_b32 s2, s2, s8
+; GFX11-NEXT:    s_and_b32 s6, s2, 7
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_gt_i32 s6, 5
 ; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-NEXT:    s_lshr_b32 s3, s3, 2
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 2
 ; GFX11-NEXT:    s_or_b32 s6, s6, s8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX11-NEXT:    s_addc_u32 s3, s3, 0
-; GFX11-NEXT:    s_cmp_lt_i32 s1, 31
-; GFX11-NEXT:    s_cselect_b32 s3, s3, 0x7c00
-; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-NEXT:    s_cmpk_eq_i32 s1, 0x40f
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT:    s_addc_u32 s2, s2, 0
+; GFX11-NEXT:    s_cmp_lt_i32 s3, 31
+; GFX11-NEXT:    s_cselect_b32 s2, s2, 0x7c00
+; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-NEXT:    s_cmpk_eq_i32 s3, 0x40f
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
 ; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX11-NEXT:    s_lshr_b32 s1, s7, 16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1770,7 +1765,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, s1, v0
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 97216b6c94693c4..9785229c381c556 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -111,12 +111,11 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; SI-NEXT:    s_or_b32 s4, s5, s6
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SI-NEXT:    s_bfe_u32 s4, s7, 0xb0014
-; SI-NEXT:    v_readfirstlane_b32 s5, v0
-; SI-NEXT:    s_sub_i32 s6, 0x3f1, s4
-; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc10
-; SI-NEXT:    s_or_b32 s11, s8, s5
+; SI-NEXT:    s_bfe_u32 s5, s7, 0xb0014
+; SI-NEXT:    s_bfe_u32 s4, s4, 0x10000
+; SI-NEXT:    s_sub_i32 s6, 0x3f1, s5
+; SI-NEXT:    s_add_i32 s10, s5, 0xfffffc10
+; SI-NEXT:    s_or_b32 s11, s8, s4
 ; SI-NEXT:    v_med3_i32 v0, s6, 0, 13
 ; SI-NEXT:    s_lshl_b32 s4, s10, 12
 ; SI-NEXT:    s_or_b32 s5, s11, 0x1000
@@ -171,8 +170,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-SAFE-SDAG-NEXT:    s_mov_b32 s1, s5
 ; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; VI-SAFE-SDAG-NEXT:    s_bfe_u32 s4, s4, 0x10000
 ; VI-SAFE-SDAG-NEXT:    s_bfe_u32 s5, s7, 0xb0014
 ; VI-SAFE-SDAG-NEXT:    s_or_b32 s6, s8, s4
 ; VI-SAFE-SDAG-NEXT:    s_sub_i32 s8, 0x3f1, s5
@@ -299,47 +297,46 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s4, s5, 0xffe
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-SAFE-SDAG-NEXT:    s_bfe_u32 s2, s3, 0xb0014
-; GFX10-SAFE-SDAG-NEXT:    s_sub_i32 s5, 0x3f1, s2
-; GFX10-SAFE-SDAG-NEXT:    s_addk_i32 s2, 0xfc10
-; GFX10-SAFE-SDAG-NEXT:    v_med3_i32 v1, s5, 0, 13
-; GFX10-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s7, s2, 12
-; GFX10-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s6, v1
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s4, s4, s5
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s5, s4, 0x1000
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s7, s4, s7
-; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s6, s5, s6
-; GFX10-SAFE-SDAG-NEXT:    v_lshlrev_b32_e64 v0, v1, s6
-; GFX10-SAFE-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX10-SAFE-SDAG-NEXT:    s_bfe_u32 s5, s3, 0xb0014
+; GFX10-SAFE-SDAG-NEXT:    s_bfe_u32 s2, s2, 0x10000
+; GFX10-SAFE-SDAG-NEXT:    s_sub_i32 s6, 0x3f1, s5
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s2, s4, s2
+; GFX10-SAFE-SDAG-NEXT:    v_med3_i32 v0, s6, 0, 13
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s4, s2, 0x1000
+; GFX10-SAFE-SDAG-NEXT:    s_addk_i32 s5, 0xfc10
+; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s7, s5, 12
+; GFX10-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s7, s2, s7
+; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s6, s4, s6
+; GFX10-SAFE-SDAG-NEXT:    v_lshlrev_b32_e64 v0, v0, s6
+; GFX10-SAFE-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s4, v0
 ; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s5, s6, s5
-; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 1
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s6, s5, 7
+; GFX10-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s4, s6, s4
+; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s5, 1
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, s7
+; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s6, s4, 7
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
+; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s4, s4, 2
 ; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-SAFE-SDAG-NEXT:    s_addc_u32 s5, s5, 0
-; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 31
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, 0x7c00
-; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX10-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s2, 0x40f
-; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-SAFE-SDAG-NEXT:    s_addc_u32 s4, s4, 0
+; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s5, 31
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, 0x7c00
+; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX10-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s5, 0x40f
+; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s2, s3, 16
 ; GFX10-SAFE-SDAG-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s2, s2, 0x8000
 ; GFX10-SAFE-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
 ; GFX10-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
-; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
+; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e32 v0, s4, v0, vcc_lo
 ; GFX10-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX10-SAFE-SDAG-NEXT:    s_mov_b32 s2, -1
 ; GFX10-SAFE-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -428,47 +425,45 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s4, s5, 0xffe
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-SAFE-SDAG-NEXT:    s_bfe_u32 s2, s3, 0xb0014
-; GFX11-SAFE-SDAG-NEXT:    s_sub_i32 s5, 0x3f1, s2
-; GFX11-SAFE-SDAG-NEXT:    s_addk_i32 s2, 0xfc10
-; GFX11-SAFE-SDAG-NEXT:    v_med3_i32 v1, s5, 0, 13
-; GFX11-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s7, s2, 12
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s6, v1
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s4, s4, s5
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s5, s4, 0x1000
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s7, s4, s7
-; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s6, s5, s6
+; GFX11-SAFE-SDAG-NEXT:    s_bfe_u32 s5, s3, 0xb0014
+; GFX11-SAFE-SDAG-NEXT:    s_bfe_u32 s2, s2, 0x10000
+; GFX11-SAFE-SDAG-NEXT:    s_sub_i32 s6, 0x3f1, s5
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s2, s4, s2
+; GFX11-SAFE-SDAG-NEXT:    v_med3_i32 v0, s6, 0, 13
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s4, s2, 0x1000
+; GFX11-SAFE-SDAG-NEXT:    s_addk_i32 s5, 0xfc10
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT:    v_lshlrev_b32_e64 v0, v1, s6
-; GFX11-SAFE-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s7, s5, 12
+; GFX11-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s7, s2, s7
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s6, s4, s6
+; GFX11-SAFE-SDAG-NEXT:    v_lshlrev_b32_e64 v0, v0, s6
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SAFE-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s4, v0
 ; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 1
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s6, s5, 7
+; GFX11-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s4, s6, s4
+; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s5, 1
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, s7
+; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s6, s4, 7
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
+; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s4, s4, 2
 ; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX11-SAFE-SDAG-NEXT:    s_addc_u32 s5, s5, 0
-; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 31
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, 0x7c00
-; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s2, 0x40f
-; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-SAFE-SDAG-NEXT:    s_addc_u32 s4, s4, 0
+; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s5, 31
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, 0x7c00
+; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s2, s3, 16
 ; GFX11-SAFE-SDAG-NEXT:    s_mov_b32 s3, 0x31016000
@@ -476,7 +471,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
-; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
+; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e32 v0, s4, v0, vcc_lo
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX11-SAFE-SDAG-NEXT:    s_mov_b32 s2, -1
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w32.mir b/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w32.mir
new file mode 100644
index 000000000000000..41b95f8ae3d31d1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w32.mir
@@ -0,0 +1,241 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
+
+---
+name: unsigned_32bits_select1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_32bits_select1
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: unsigned_32bits_select_neg1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_32bits_select_neg1
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: unsigned_32bits_select_neg_arbitrary
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_32bits_select_neg_arbitrary
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 5, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 5, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: signed_32bits
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: signed_32bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[S_BFE_I32_:%[0-9]+]]:sreg_32 = S_BFE_I32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: unsigned_64bits
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_64bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    %4:sreg_32 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
+...
+
+---
+name: signed_64bits
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: signed_64bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GCN-NEXT: [[S_BFE_I32_:%[0-9]+]]:sreg_32 = S_BFE_I32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    %4:sreg_32 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
+...
+
+---
+name: unsigned_64bits_double
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_64bits_double
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    ; GCN-NEXT: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1
+    %3:sreg_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec
+    %4:sreg_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
+...
+
+---
+name: signed_64bits_double
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: signed_64bits_double
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1
+    ; GCN-NEXT: [[S_BFE_I32_:%[0-9]+]]:sreg_32 = S_BFE_I32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    ; GCN-NEXT: [[S_BFE_I32_1:%[0-9]+]]:sreg_32 = S_BFE_I32 [[S_CSELECT_B32_]], 65536, implicit-def $scc
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1
+    %3:sreg_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec
+    %4:sreg_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
+...
+
+---
+name: bad_subreg_64bits
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_subreg_64bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    %4:sreg_32 = V_READFIRSTLANE_B32 %3.sub1, implicit $exec
+...
+
+---
+name: bad_select_imm
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_select_imm
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 1, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 1, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: bad_select_imm_2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_select_imm_2
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 0, 1, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 0, 1, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: bad_cndmask_imm
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_cndmask_imm
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+
+---
+name: bad_cndmask_imm_2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_cndmask_imm_2
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 0, [[S_CSELECT_B32_]], implicit $exec
+    ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec
+    %0:sreg_32_xm0_xexec = S_CSELECT_B32 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 0, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w64.mir b/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w64.mir
new file mode 100644
index 000000000000000..6e75270bf5e8fc4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-readfirstlane-cndmask-w64.mir
@@ -0,0 +1,241 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr="+wavefrontsize64" -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
+
+---
+name: unsigned_32bits_select1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_32bits_select1
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: unsigned_32bits_select_neg1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_32bits_select_neg1
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: unsigned_32bits_select_neg_arbitrary
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_32bits_select_neg_arbitrary
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 5, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 5, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: signed_32bits
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: signed_32bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[S_BFE_I32_:%[0-9]+]]:sreg_32 = S_BFE_I32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: unsigned_64bits
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_64bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    %4:sreg_32 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
+...
+
+---
+name: signed_64bits
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: signed_64bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GCN-NEXT: [[S_BFE_I32_:%[0-9]+]]:sreg_32 = S_BFE_I32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    %4:sreg_32 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
+...
+
+---
+name: unsigned_64bits_double
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: unsigned_64bits_double
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1
+    ; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    ; GCN-NEXT: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1
+    %3:sreg_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec
+    %4:sreg_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
+...
+
+---
+name: signed_64bits_double
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: signed_64bits_double
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1
+    ; GCN-NEXT: [[S_BFE_I32_:%[0-9]+]]:sreg_32 = S_BFE_I32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    ; GCN-NEXT: [[S_BFE_I32_1:%[0-9]+]]:sreg_32 = S_BFE_I32 [[S_CSELECT_B64_]].sub0, 65536, implicit-def $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1
+    %3:sreg_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec
+    %4:sreg_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
+...
+
+---
+name: bad_subreg_64bits
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_subreg_64bits
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    %4:sreg_32 = V_READFIRSTLANE_B32 %3.sub1, implicit $exec
+...
+
+---
+name: bad_select_imm
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_select_imm
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 1, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 1, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: bad_select_imm_2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_select_imm_2
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 0, 1, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec
+    %0:sreg_64_xexec = S_CSELECT_B64 0, 1, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+---
+name: bad_cndmask_imm
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_cndmask_imm
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 1, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...
+
+
+---
+name: bad_cndmask_imm_2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $scc
+    ; GCN-LABEL: name: bad_cndmask_imm_2
+    ; GCN: liveins: $scc
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 0, [[S_CSELECT_B64_]], implicit $exec
+    ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_CNDMASK_B32_e64_]], implicit $exec
+    %0:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, 0, 0, %0, implicit $exec
+    %2:sreg_32 = V_READFIRSTLANE_B32 %1, implicit $exec
+...

From e57d6e85831c157687f7e5f3d5632972893ce5ba Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 25 Oct 2023 12:44:48 +0200
Subject: [PATCH 2/3] Add null check for RFLSrc

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d4c652eda715b80..62c420371f590d4 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1416,6 +1416,9 @@ bool SIFoldOperands::tryFoldUniformReadFirstLaneCndMask(
     return false;
 
   MachineInstr *RFLSrc = MRI->getVRegDef(MI.getOperand(1).getReg());
+  if(!RFLSrc)
+    return false;
+
   // We can also have the following pattern:
   //
   // %2:vreg_64 = REG_SEQUENCE %X:vgpr_32, sub0, %1:sreg_32, sub1

From 5153268e312ddbd310a7e2de5ce53bf426b902a4 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 25 Oct 2023 12:45:34 +0200
Subject: [PATCH 3/3] clang-format

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 62c420371f590d4..3b05db0f6aaf6f2 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1416,7 +1416,7 @@ bool SIFoldOperands::tryFoldUniformReadFirstLaneCndMask(
     return false;
 
   MachineInstr *RFLSrc = MRI->getVRegDef(MI.getOperand(1).getReg());
-  if(!RFLSrc)
+  if (!RFLSrc)
     return false;
 
   // We can also have the following pattern:


