[llvm] 2b63933 - [AMDGPU][SDag] Better lowering for 32-bit ctlz/cttz

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 5 07:57:52 PDT 2021


Author: Jay Foad
Date: 2021-08-05T15:57:40+01:00
New Revision: 2b63933115f75f4b06c317a77fdef0e4e5656976

URL: https://github.com/llvm/llvm-project/commit/2b63933115f75f4b06c317a77fdef0e4e5656976
DIFF: https://github.com/llvm/llvm-project/commit/2b63933115f75f4b06c317a77fdef0e4e5656976.diff

LOG: [AMDGPU][SDag] Better lowering for 32-bit ctlz/cttz

Differential Revision: https://reviews.llvm.org/D107566

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/ctlz.ll
    llvm/test/CodeGen/AMDGPU/cttz.ll
    llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
    llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5e30e1c1341b..e8a46e050974 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2386,8 +2386,16 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
                    Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
 
   if (Src.getValueType() == MVT::i32) {
-    assert(ZeroUndef);
-    return DAG.getNode(NewOpc, SL, MVT::i32, Src);
+    // (ctlz hi:lo) -> (umin (ffbh src), 32)
+    // (cttz hi:lo) -> (umin (ffbl src), 32)
+    // (ctlz_zero_undef src) -> (ffbh src)
+    // (cttz_zero_undef src) -> (ffbl src)
+    SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
+    if (!ZeroUndef) {
+      const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
+      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
+    }
+    return NewOpr;
   }
 
   SDValue Lo, Hi;

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c3d9ea4381c2..eba19106d4f0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -465,11 +465,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (!Subtarget->hasBCNT(64))
     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
 
-  if (Subtarget->hasFFBH())
+  if (Subtarget->hasFFBH()) {
+    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+  }
 
-  if (Subtarget->hasFFBL())
+  if (Subtarget->hasFFBL()) {
+    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+  }
 
   // We only really have 32-bit BFE instructions (and 16-bit on VI).
   //

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index c51e12500ff5..1b1fd1b5a630 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -22,15 +22,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
 ; SI-LABEL: s_ctlz_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_flbit_i32_b32 s5, s4
+; SI-NEXT:    s_flbit_i32_b32 s2, s2
+; SI-NEXT:    s_min_u32 s4, s2, 32
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -41,9 +40,8 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_flbit_i32_b32 s1, s0
-; VI-NEXT:    s_cmp_lg_u32 s0, 0
-; VI-NEXT:    s_cselect_b32 s0, s1, 32
+; VI-NEXT:    s_flbit_i32_b32 s0, s0
+; VI-NEXT:    s_min_u32 s0, s0, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -68,8 +66,7 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_flbit_i32_b32 s0, s4
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s0, s0, 32
+; GFX10-NEXT:    s_min_u32 s0, s0, 32
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX10-NEXT:    s_endpgm
@@ -98,17 +95,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbh_u32_e32 v1, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -126,9 +122,8 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -157,14 +152,13 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32:
@@ -203,12 +197,10 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbh_u32_e32 v2, v1
-; SI-NEXT:    v_ffbh_u32_e32 v3, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v1, 32, v1
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -226,12 +218,10 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v2, v1
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; VI-NEXT:    v_ffbh_u32_e32 v3, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v1, v1
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v1, 32, v1
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -263,17 +253,15 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_ffbh_u32_e32 v3, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v2i32:
@@ -315,18 +303,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbh_u32_e32 v4, v3
-; SI-NEXT:    v_ffbh_u32_e32 v5, v2
-; SI-NEXT:    v_ffbh_u32_e32 v6, v1
-; SI-NEXT:    v_ffbh_u32_e32 v7, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
+; SI-NEXT:    v_ffbh_u32_e32 v3, v3
+; SI-NEXT:    v_ffbh_u32_e32 v2, v2
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v3, 32, v3
+; SI-NEXT:    v_min_u32_e32 v2, 32, v2
+; SI-NEXT:    v_min_u32_e32 v1, 32, v1
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -344,18 +328,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v4, v3
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
-; VI-NEXT:    v_ffbh_u32_e32 v5, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
-; VI-NEXT:    v_ffbh_u32_e32 v6, v1
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
-; VI-NEXT:    v_ffbh_u32_e32 v7, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v3, v3
+; VI-NEXT:    v_ffbh_u32_e32 v2, v2
+; VI-NEXT:    v_ffbh_u32_e32 v1, v1
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v3, 32, v3
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
+; VI-NEXT:    v_min_u32_e32 v1, 32, v1
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -397,18 +377,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v5, v3
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_ffbh_u32_e32 v6, v2
-; GFX10-NEXT:    v_ffbh_u32_e32 v7, v1
-; GFX10-NEXT:    v_ffbh_u32_e32 v8, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, 32, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 32, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v8, vcc_lo
+; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
+; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
+; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -455,9 +431,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
 ; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbh_u32_e32 v1, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
@@ -474,9 +449,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
 ; VI-NEXT:    v_add_u16_e32 v0, -8, v0
 ; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
@@ -520,9 +494,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc_lo
+; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
+; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, -16, v1
 ; GFX10-NEXT:    v_add_nc_u16 v1, v1, -8
 ; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
@@ -1152,9 +1125,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbh_u32_e32 v1, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1174,9 +1146,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -1211,13 +1182,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -1263,9 +1233,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbh_u32_e32 v1, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1285,9 +1254,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -1322,13 +1290,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -1493,11 +1460,11 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 32, v1, s[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
-; VI-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[0:1]
+; VI-NEXT:    v_min_u32_e32 v1, 32, v1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT:    v_mov_b32_e32 v0, 0xffff
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1538,9 +1505,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, -16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v1, vcc_lo
+; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 7172260f36eb..54f7e238b230 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -22,15 +22,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
 ; SI-LABEL: s_cttz_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_ff1_i32_b32 s5, s4
+; SI-NEXT:    s_ff1_i32_b32 s2, s2
+; SI-NEXT:    s_min_u32 s4, s2, 32
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -41,9 +40,8 @@ define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val)
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ff1_i32_b32 s1, s0
-; VI-NEXT:    s_cmp_lg_u32 s0, 0
-; VI-NEXT:    s_cselect_b32 s0, s1, 32
+; VI-NEXT:    s_ff1_i32_b32 s0, s0
+; VI-NEXT:    s_min_u32 s0, s0, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -68,8 +66,7 @@ define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_ff1_i32_b32 s0, s4
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s0, s0, 32
+; GFX10-NEXT:    s_min_u32 s0, s0, 32
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX10-NEXT:    s_endpgm
@@ -98,17 +95,16 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbl_b32_e32 v1, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT:    v_ffbl_b32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -126,9 +122,8 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbl_b32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -157,14 +152,13 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbl_b32_e32 v1, v0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32:
@@ -203,12 +197,10 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbl_b32_e32 v2, v1
-; SI-NEXT:    v_ffbl_b32_e32 v3, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
+; SI-NEXT:    v_ffbl_b32_e32 v1, v1
+; SI-NEXT:    v_ffbl_b32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v1, 32, v1
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -226,12 +218,10 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbl_b32_e32 v2, v1
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; VI-NEXT:    v_ffbl_b32_e32 v3, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v1, v1
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v1, 32, v1
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -263,17 +253,15 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbl_b32_e32 v2, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_ffbl_b32_e32 v3, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
+; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v2i32:
@@ -315,18 +303,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbl_b32_e32 v4, v3
-; SI-NEXT:    v_ffbl_b32_e32 v5, v2
-; SI-NEXT:    v_ffbl_b32_e32 v6, v1
-; SI-NEXT:    v_ffbl_b32_e32 v7, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
+; SI-NEXT:    v_ffbl_b32_e32 v3, v3
+; SI-NEXT:    v_ffbl_b32_e32 v2, v2
+; SI-NEXT:    v_ffbl_b32_e32 v1, v1
+; SI-NEXT:    v_ffbl_b32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v3, 32, v3
+; SI-NEXT:    v_min_u32_e32 v2, 32, v2
+; SI-NEXT:    v_min_u32_e32 v1, 32, v1
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -344,18 +328,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbl_b32_e32 v4, v3
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
-; VI-NEXT:    v_ffbl_b32_e32 v5, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
-; VI-NEXT:    v_ffbl_b32_e32 v6, v1
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
-; VI-NEXT:    v_ffbl_b32_e32 v7, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v3, v3
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
+; VI-NEXT:    v_ffbl_b32_e32 v1, v1
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v3, 32, v3
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
+; VI-NEXT:    v_min_u32_e32 v1, 32, v1
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -397,18 +377,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbl_b32_e32 v5, v3
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_ffbl_b32_e32 v6, v2
-; GFX10-NEXT:    v_ffbl_b32_e32 v7, v1
-; GFX10-NEXT:    v_ffbl_b32_e32 v8, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, 32, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 32, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v8, vcc_lo
+; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
+; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
+; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
+; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1141,9 +1117,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbl_b32_e32 v1, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT:    v_ffbl_b32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1163,9 +1138,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbl_b32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -1200,13 +1174,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbl_b32_e32 v1, v0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -1252,9 +1225,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbl_b32_e32 v1, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT:    v_ffbl_b32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1274,9 +1246,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbl_b32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -1311,13 +1282,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbl_b32_e32 v1, v0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -1482,9 +1452,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT:    v_ffbl_b32_e32 v3, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v3, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -1526,10 +1495,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
-; GFX10-NEXT:    v_ffbl_b32_e32 v3, v2
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 32, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
+; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index d13edccf27c2..36e4773f53e1 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -782,9 +782,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_ffbl_b32_e32 v1, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT:    v_ffbl_b32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -820,9 +819,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_ffbl_b32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v2, 32, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -1365,9 +1363,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_ffbl_b32_e32 v1, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT:    v_ffbl_b32_e32 v0, v0
+; SI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1405,9 +1402,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_ffbl_b32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
+; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -1605,9 +1601,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT:    v_ffbl_b32_e32 v3, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v3, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 7abd17ab49f0..59ef31d06d49 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -9,21 +9,19 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_flbit_i32_b32 s8, s3
 ; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, s3, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 32, v0, vcc
-; GFX6-NEXT:    v_lshl_b64 v[0:1], s[2:3], v2
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    s_flbit_i32_b32 s0, s3
+; GFX6-NEXT:    s_min_u32 s8, s0, 32
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[2:3], s0, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX6-NEXT:    v_or_b32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_sub_i32 s0, 32, s8
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -33,8 +31,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_flbit_i32_b32 s4, s3
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX8-NEXT:    s_cselect_b32 s6, s4, 32
+; GFX8-NEXT:    s_min_u32 s6, s4, 32
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], s2, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -67,8 +64,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
+; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -93,8 +89,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
 ; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_ffbh_u32_e32 v4, v2
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, 32, v4, vcc
+; GFX8-NEXT:    v_min_u32_e32 v4, 32, v4
 ; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v4, v[1:2]
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -122,21 +117,19 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_flbit_i32_b32 s8, s3
 ; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, s3, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 32, v0, vcc
-; GFX6-NEXT:    v_lshl_b64 v[0:1], s[2:3], v2
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    s_flbit_i32_b32 s0, s3
+; GFX6-NEXT:    s_min_u32 s8, s0, 32
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[2:3], s0, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX6-NEXT:    v_or_b32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_sub_i32 s0, 32, s8
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -145,8 +138,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_flbit_i32_b32 s4, s3
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX8-NEXT:    s_cselect_b32 s6, s4, 32
+; GFX8-NEXT:    s_min_u32 s6, s4, 32
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], s2, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -178,8 +170,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
+; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -203,8 +194,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
 ; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_ffbh_u32_e32 v0, v2
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, 32, v0, vcc
+; GFX8-NEXT:    v_min_u32_e32 v5, 32, v0
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v5, v[1:2]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -236,56 +226,50 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)*
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_flbit_i32_b32 s8, s7
 ; GFX6-NEXT:    s_flbit_i32_b32 s9, s5
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, s7, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 32, v0, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, s9
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, s5, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, 32, v0, vcc
-; GFX6-NEXT:    v_lshl_b64 v[0:1], s[6:7], v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 32, v2
-; GFX6-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX6-NEXT:    s_min_u32 s8, s8, 32
+; GFX6-NEXT:    s_min_u32 s9, s9, 32
+; GFX6-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
+; GFX6-NEXT:    s_sub_i32 s10, 32, s8
+; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s9
+; GFX6-NEXT:    s_sub_i32 s11, 32, s9
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[8:9], s6, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[8:9], s4, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX6-NEXT:    v_or_b32_e32 v0, s7, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s5, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v1
-; GFX6-NEXT:    v_ldexp_f32_e32 v1, v0, v5
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v4
+; GFX6-NEXT:    v_ldexp_f32_e64 v1, v0, s10
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v2, s11
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_flbit_i32_b32 s2, s7
-; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX8-NEXT:    s_cselect_b32 s9, s2, 32
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[6:7], s9
-; GFX8-NEXT:    s_sub_i32 s9, 32, s9
+; GFX8-NEXT:    s_flbit_i32_b32 s6, s3
+; GFX8-NEXT:    s_min_u32 s8, s6, 32
+; GFX8-NEXT:    s_flbit_i32_b32 s7, s1
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX8-NEXT:    s_min_u32 s9, s7, 32
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], s2, 0
-; GFX8-NEXT:    s_flbit_i32_b32 s8, s5
-; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
-; GFX8-NEXT:    s_cselect_b32 s6, s8, 32
+; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
 ; GFX8-NEXT:    v_or_b32_e32 v0, s3, v0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[4:5], s6
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], s2, 0
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], s0, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v1, s1, v1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v1
-; GFX8-NEXT:    s_sub_i32 s2, 32, s6
-; GFX8-NEXT:    v_ldexp_f32 v1, v0, s9
-; GFX8-NEXT:    v_ldexp_f32 v0, v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    s_sub_i32 s0, 32, s8
+; GFX8-NEXT:    v_ldexp_f32 v1, v0, s0
+; GFX8-NEXT:    s_sub_i32 s0, 32, s9
+; GFX8-NEXT:    v_ldexp_f32 v0, v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
   %result = uitofp <2 x i64> %in to <2 x float>
@@ -314,14 +298,10 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_ffbh_u32_e32 v12, v8
 ; GFX6-NEXT:    v_ffbh_u32_e32 v13, v6
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, 32, v9, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e32 v12, 32, v12, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v13, 32, v13, vcc
+; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
+; GFX6-NEXT:    v_min_u32_e32 v9, 32, v9
+; GFX6-NEXT:    v_min_u32_e32 v12, 32, v12
+; GFX6-NEXT:    v_min_u32_e32 v13, 32, v13
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
 ; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
@@ -374,16 +354,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
 ; GFX8-NEXT:    v_ffbh_u32_e32 v12, v4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_ffbh_u32_e32 v0, v8
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
 ; GFX8-NEXT:    v_ffbh_u32_e32 v11, v6
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, 32, v11, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, 32, v12, vcc
 ; GFX8-NEXT:    v_ffbh_u32_e32 v13, v2
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, 32, v13, vcc
+; GFX8-NEXT:    v_min_u32_e32 v0, 32, v0
+; GFX8-NEXT:    v_min_u32_e32 v11, 32, v11
+; GFX8-NEXT:    v_min_u32_e32 v12, 32, v12
+; GFX8-NEXT:    v_min_u32_e32 v13, 32, v13
 ; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v0, v[7:8]
 ; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
 ; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v11, v[5:6]
@@ -433,26 +409,22 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)*
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_flbit_i32_b32 s8, s7
 ; GFX6-NEXT:    s_flbit_i32_b32 s9, s5
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, s7, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 32, v0, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, s9
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, s5, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, 32, v0, vcc
-; GFX6-NEXT:    v_lshl_b64 v[0:1], s[6:7], v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 32, v2
-; GFX6-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX6-NEXT:    s_min_u32 s8, s8, 32
+; GFX6-NEXT:    s_min_u32 s9, s9, 32
+; GFX6-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
+; GFX6-NEXT:    s_sub_i32 s10, 32, s8
+; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s9
+; GFX6-NEXT:    s_sub_i32 s11, 32, s9
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[8:9], s6, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[8:9], s4, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX6-NEXT:    v_or_b32_e32 v0, s7, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s5, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v5
-; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v4
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s10
+; GFX6-NEXT:    v_ldexp_f32_e64 v1, v1, s11
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -466,24 +438,22 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)*
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_flbit_i32_b32 s2, s7
-; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX8-NEXT:    s_cselect_b32 s9, s2, 32
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[6:7], s9
-; GFX8-NEXT:    s_sub_i32 s9, 32, s9
+; GFX8-NEXT:    s_flbit_i32_b32 s3, s5
+; GFX8-NEXT:    s_min_u32 s8, s2, 32
+; GFX8-NEXT:    s_min_u32 s9, s3, 32
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[6:7], s8
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], s2, 0
-; GFX8-NEXT:    s_flbit_i32_b32 s8, s5
-; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
-; GFX8-NEXT:    s_cselect_b32 s6, s8, 32
 ; GFX8-NEXT:    v_or_b32_e32 v0, s3, v0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[4:5], s6
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[4:5], s9
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], s2, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX8-NEXT:    s_sub_i32 s2, 32, s6
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, s9
+; GFX8-NEXT:    s_sub_i32 s8, 32, s8
+; GFX8-NEXT:    s_sub_i32 s2, 32, s9
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, s8
 ; GFX8-NEXT:    v_ldexp_f32 v1, v1, s2
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -518,14 +488,10 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_ffbh_u32_e32 v12, v8
 ; GFX6-NEXT:    v_ffbh_u32_e32 v13, v6
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, 32, v9, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e32 v12, 32, v12, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v13, 32, v13, vcc
+; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
+; GFX6-NEXT:    v_min_u32_e32 v9, 32, v9
+; GFX6-NEXT:    v_min_u32_e32 v12, 32, v12
+; GFX6-NEXT:    v_min_u32_e32 v13, 32, v13
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
 ; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
@@ -584,16 +550,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
 ; GFX8-NEXT:    v_ffbh_u32_e32 v13, v4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_ffbh_u32_e32 v0, v8
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
 ; GFX8-NEXT:    v_ffbh_u32_e32 v12, v6
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, 32, v12, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, 32, v13, vcc
 ; GFX8-NEXT:    v_ffbh_u32_e32 v14, v2
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, 32, v14, vcc
+; GFX8-NEXT:    v_min_u32_e32 v0, 32, v0
+; GFX8-NEXT:    v_min_u32_e32 v12, 32, v12
+; GFX8-NEXT:    v_min_u32_e32 v13, 32, v13
+; GFX8-NEXT:    v_min_u32_e32 v14, 32, v14
 ; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v0, v[7:8]
 ; GFX8-NEXT:    v_sub_u32_e32 v15, vcc, 32, v0
 ; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v12, v[5:6]


        


More information about the llvm-commits mailing list