[llvm] b4e751e - AMDGPU: Optimize set_rounding if input is known to fit in 2 bits (#88588)

via llvm-commits llvm-commits at lists.llvm.org
Fri May 3 02:17:22 PDT 2024


Author: Matt Arsenault
Date: 2024-05-03T11:17:18+02:00
New Revision: b4e751e2ab0ff152ed18dea59ebf9691e963e1dd

URL: https://github.com/llvm/llvm-project/commit/b4e751e2ab0ff152ed18dea59ebf9691e963e1dd
DIFF: https://github.com/llvm/llvm-project/commit/b4e751e2ab0ff152ed18dea59ebf9691e963e1dd.diff

LOG: AMDGPU: Optimize set_rounding if input is known to fit in 2 bits (#88588)

We don't need to figure out the weird extended rounding modes or
handle offsets to keep the lookup table in 64-bits.
    
https://reviews.llvm.org/D153258

Depends on #88587

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ee1e610e640bd2..0a2686ebe9316e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4076,34 +4076,54 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
     NewMode = DAG.getConstant(
         AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
   } else {
-    SDValue BitTable =
-        DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
-
+    // If we know the input can only be one of the supported standard modes in
+    // the range 0-3, we can use a simplified mapping to hardware values.
+    KnownBits KB = DAG.computeKnownBits(NewMode);
+    const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
     // The supported standard values are 0-3. The extended values start at 8. We
     // need to offset by 4 if the value is in the extended range.
 
-    // is_standard = value < 4;
-    // table_index = is_standard ? value : (value - 4)
-    // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
+    if (UseReducedTable) {
+      // Truncate to the low 32-bits.
+      SDValue BitTable = DAG.getConstant(
+          AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
+
+      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+      SDValue RoundModeTimesNumBits =
+          DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
 
-    SDValue Four = DAG.getConstant(4, SL, MVT::i32);
-    SDValue IsStandardValue =
-        DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT);
-    SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
-    SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
-                                   NewMode, OffsetEnum);
+      NewMode =
+          DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
+
+      // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
+      // the table extracted bits into inline immediates.
+    } else {
+      // is_standard = value < 4;
+      // table_index = is_standard ? value : (value - 4)
+      // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
+      SDValue BitTable =
+          DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
 
-    SDValue Two = DAG.getConstant(2, SL, MVT::i32);
-    SDValue RoundModeTimesNumBits =
-        DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
+      SDValue Four = DAG.getConstant(4, SL, MVT::i32);
+      SDValue IsStandardValue =
+          DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT);
+      SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
 
-    SDValue TableValue =
-        DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
-    SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
+      SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
+                                     NewMode, OffsetEnum);
 
-    // No need to mask out the high bits since the setreg will ignore them
-    // anyway.
-    NewMode = TruncTable;
+      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+      SDValue RoundModeTimesNumBits =
+          DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
+
+      SDValue TableValue =
+          DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
+      SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
+
+      // No need to mask out the high bits since the setreg will ignore them
+      // anyway.
+      NewMode = TruncTable;
+    }
 
     // Insert a readfirstlane in case the value is a VGPR. We could do this
     // earlier and keep more operations scalar, but that interferes with

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 0f8571ef07cdfa..ca90f6fd88514a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -885,86 +885,52 @@ define amdgpu_gfx void @s_set_rounding_i2_zeroext(i2 zeroext inreg %rounding) {
 ; GFX6-LABEL: s_set_rounding_i2_zeroext:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_or_b32 s34, s4, -4
-; GFX6-NEXT:    s_cmp_lt_u32 s4, 4
-; GFX6-NEXT:    s_cselect_b32 s34, s4, s34
-; GFX6-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX6-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX6-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX6-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT:    s_lshl_b32 s34, s4, 2
+; GFX6-NEXT:    s_lshr_b32 s34, 0xa50f, s34
 ; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: s_set_rounding_i2_zeroext:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_or_b32 s34, s4, -4
-; GFX7-NEXT:    s_cmp_lt_u32 s4, 4
-; GFX7-NEXT:    s_cselect_b32 s34, s4, s34
-; GFX7-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX7-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX7-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX7-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT:    s_lshl_b32 s34, s4, 2
+; GFX7-NEXT:    s_lshr_b32 s34, 0xa50f, s34
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: s_set_rounding_i2_zeroext:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s36, 0xffff, s4
-; GFX8-NEXT:    v_cmp_lt_u16_e64 s[34:35], s4, 4
-; GFX8-NEXT:    s_or_b32 s37, s36, -4
-; GFX8-NEXT:    s_and_b64 s[34:35], s[34:35], exec
-; GFX8-NEXT:    s_cselect_b32 s34, s36, s37
-; GFX8-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX8-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX8-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX8-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX8-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX8-NEXT:    s_lshr_b32 s34, 0xa50f, s34
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_set_rounding_i2_zeroext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s36, 0xffff, s4
-; GFX9-NEXT:    v_cmp_lt_u16_e64 s[34:35], s4, 4
-; GFX9-NEXT:    s_or_b32 s37, s36, -4
-; GFX9-NEXT:    s_and_b64 s[34:35], s[34:35], exec
-; GFX9-NEXT:    s_cselect_b32 s34, s36, s37
-; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX9-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX9-NEXT:    s_lshr_b32 s34, 0xa50f, s34
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_set_rounding_i2_zeroext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_u16_e64 s34, s4, 4
-; GFX10-NEXT:    s_and_b32 s35, 0xffff, s4
-; GFX10-NEXT:    s_or_b32 s36, s35, -4
-; GFX10-NEXT:    s_and_b32 s34, s34, exec_lo
-; GFX10-NEXT:    s_cselect_b32 s34, s35, s36
-; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX10-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX10-NEXT:    s_lshr_b32 s34, 0xa50f, s34
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: s_set_rounding_i2_zeroext:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_lt_u16_e64 s0, s4, 4
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s4
-; GFX11-NEXT:    s_or_b32 s2, s1, -4
-; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
-; GFX11-NEXT:    s_cselect_b32 s0, s1, s2
-; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
-; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-NEXT:    s_lshr_b32 s0, 0xa50f, s0
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %zext.rounding = zext i2 %rounding to i32
@@ -1244,10 +1210,8 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
 ; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[34:35], -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX6-NEXT:    s_mov_b32 s34, 0x1c84a50f
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[34:35], v0
+; GFX6-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s34, v0
 ; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1258,10 +1222,8 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
 ; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX7-NEXT:    s_cselect_b64 s[34:35], -1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX7-NEXT:    s_mov_b32 s34, 0x1c84a50f
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX7-NEXT:    v_lshr_b64 v[0:1], s[34:35], v0
+; GFX7-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s34, v0
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -1272,10 +1234,9 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
 ; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[34:35], -1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX8-NEXT:    s_mov_b32 s34, 0x1c84a50f
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX8-NEXT:    s_mov_b32 s34, 0xa50f
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
 ; GFX8-NEXT:    v_readfirstlane_b32 s34, v0
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1286,10 +1247,9 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[34:35], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX9-NEXT:    s_mov_b32 s34, 0xa50f
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
 ; GFX9-NEXT:    v_readfirstlane_b32 s34, v0
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1300,10 +1260,8 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10-NEXT:    s_cselect_b32 s34, -1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
 ; GFX10-NEXT:    v_readfirstlane_b32 s34, v0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1314,10 +1272,8 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
 ; GFX11-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -1332,14 +1288,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX678-NEXT:    s_cselect_b32 s34, 1, 3
-; GFX678-NEXT:    s_or_b32 s35, s34, -4
-; GFX678-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX678-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX678-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX678-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX678-NEXT:    s_cselect_b32 s34, 0xa50, 10
 ; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1347,14 +1296,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s34, 1, 3
-; GFX9-NEXT:    s_or_b32 s35, s34, -4
-; GFX9-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX9-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_cselect_b32 s34, 0xa50, 10
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1362,14 +1304,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s34, 1, 3
-; GFX10-NEXT:    s_or_b32 s35, s34, -4
-; GFX10-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX10-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_cselect_b32 s34, 0xa50, 10
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1377,14 +1312,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s0, 1, 3
-; GFX11-NEXT:    s_or_b32 s1, s0, -4
-; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
-; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
-; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_cselect_b32 s0, 0xa50, 10
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %cond, 0
@@ -1394,66 +1322,22 @@ define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
 }
 
 define void @v_set_rounding_select_1_3(i32 %cond) {
-; GFX6-LABEL: v_set_rounding_select_1_3:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, 3, 1, vcc
-; GFX6-NEXT:    v_or_b32_e32 v1, -4, v0
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT:    s_mov_b32 s4, 0x1c84a50f
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_mov_b32 s5, 0xb73e62d9
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v0
-; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_set_rounding_select_1_3:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 3, 1, vcc
-; GFX7-NEXT:    v_or_b32_e32 v1, -4, v0
-; GFX7-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX7-NEXT:    s_mov_b32 s4, 0x1c84a50f
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_mov_b32 s5, 0xb73e62d9
-; GFX7-NEXT:    v_lshr_b64 v[0:1], s[4:5], v0
-; GFX7-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_set_rounding_select_1_3:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 3, 1, vcc
-; GFX8-NEXT:    v_or_b32_e32 v1, -4, v0
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT:    s_mov_b32 s4, 0x1c84a50f
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_mov_b32 s5, 0xb73e62d9
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
-; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX678-LABEL: v_set_rounding_select_1_3:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xa50
+; GFX678-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX678-NEXT:    v_cndmask_b32_e32 v0, 10, v1, vcc
+; GFX678-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_set_rounding_select_1_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xa50
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 3, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v1, -4, v0
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x1c84a50f
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0xb73e62d9
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 10, v1, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1462,14 +1346,7 @@ define void @v_set_rounding_select_1_3(i32 %cond) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0x1c84a50f
-; GFX10-NEXT:    s_mov_b32 s5, 0xb73e62d9
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 3, 1, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v1, -4, v0
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 4, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1478,14 +1355,7 @@ define void @v_set_rounding_select_1_3(i32 %cond) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 3, 1, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v1, -4, v0
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 4, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -1496,23 +1366,42 @@ define void @v_set_rounding_select_1_3(i32 %cond) {
 }
 
 define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
-; GFX678-LABEL: s_set_rounding_select_2_0:
-; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX678-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX678-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX678-NEXT:    v_readfirstlane_b32 s34, v0
-; GFX678-NEXT:    s_lshl_b32 s34, s34, 1
-; GFX678-NEXT:    s_or_b32 s35, s34, -4
-; GFX678-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX678-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX678-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX678-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
-; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-LABEL: s_set_rounding_select_2_0:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX6-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_select_2_0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX7-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_select_2_0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_mov_b32 s34, 0xa50f
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
+; GFX8-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_set_rounding_select_2_0:
 ; GFX9:       ; %bb.0:
@@ -1520,15 +1409,10 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[34:35], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_mov_b32 s34, 0xa50f
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
 ; GFX9-NEXT:    v_readfirstlane_b32 s34, v0
-; GFX9-NEXT:    s_lshl_b32 s34, s34, 1
-; GFX9-NEXT:    s_or_b32 s35, s34, -4
-; GFX9-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX9-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1538,15 +1422,9 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX10-NEXT:    s_cselect_b32 s34, -1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
 ; GFX10-NEXT:    v_readfirstlane_b32 s34, v0
-; GFX10-NEXT:    s_lshl_b32 s34, s34, 1
-; GFX10-NEXT:    s_or_b32 s35, s34, -4
-; GFX10-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX10-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1556,15 +1434,9 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-NEXT:    s_or_b32 s1, s0, -4
-; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
-; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
-; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %cond, 0
@@ -1578,14 +1450,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX678-NEXT:    s_cselect_b32 s34, 2, 1
-; GFX678-NEXT:    s_or_b32 s35, s34, -4
-; GFX678-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX678-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX678-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX678-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX678-NEXT:    s_movk_i32 s34, 0xa5
+; GFX678-NEXT:    s_cselect_b32 s34, s34, 0xa50
 ; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1593,14 +1459,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s34, 2, 1
-; GFX9-NEXT:    s_or_b32 s35, s34, -4
-; GFX9-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX9-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_movk_i32 s34, 0xa5
+; GFX9-NEXT:    s_cselect_b32 s34, s34, 0xa50
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1608,14 +1468,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s34, 2, 1
-; GFX10-NEXT:    s_or_b32 s35, s34, -4
-; GFX10-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX10-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_movk_i32 s34, 0xa5
+; GFX10-NEXT:    s_cselect_b32 s34, s34, 0xa50
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1623,14 +1477,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s0, 2, 1
-; GFX11-NEXT:    s_or_b32 s1, s0, -4
-; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
-; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
-; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_movk_i32 s0, 0xa5
+; GFX11-NEXT:    s_cselect_b32 s0, s0, 0xa50
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %cond, 0
@@ -1644,14 +1492,8 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX678-NEXT:    s_cselect_b32 s34, 1, 2
-; GFX678-NEXT:    s_or_b32 s35, s34, -4
-; GFX678-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX678-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX678-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX678-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX678-NEXT:    s_movk_i32 s34, 0xa50
+; GFX678-NEXT:    s_cselect_b32 s34, s34, 0xa5
 ; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1659,14 +1501,8 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s34, 1, 2
-; GFX9-NEXT:    s_or_b32 s35, s34, -4
-; GFX9-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX9-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_movk_i32 s34, 0xa50
+; GFX9-NEXT:    s_cselect_b32 s34, s34, 0xa5
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1674,14 +1510,8 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s34, 1, 2
-; GFX10-NEXT:    s_or_b32 s35, s34, -4
-; GFX10-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX10-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_movk_i32 s34, 0xa50
+; GFX10-NEXT:    s_cselect_b32 s34, s34, 0xa5
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1689,14 +1519,8 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s0, 1, 2
-; GFX11-NEXT:    s_or_b32 s1, s0, -4
-; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
-; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
-; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_movk_i32 s0, 0xa50
+; GFX11-NEXT:    s_cselect_b32 s0, s0, 0xa5
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %cond, 0
@@ -1710,14 +1534,7 @@ define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX678-NEXT:    s_cselect_b32 s34, 3, 0
-; GFX678-NEXT:    s_or_b32 s35, s34, -4
-; GFX678-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX678-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX678-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX678-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX678-NEXT:    s_cselect_b32 s34, 10, 0xa50f
 ; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1725,14 +1542,7 @@ define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s34, 3, 0
-; GFX9-NEXT:    s_or_b32 s35, s34, -4
-; GFX9-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX9-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_cselect_b32 s34, 10, 0xa50f
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1740,14 +1550,7 @@ define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s34, 3, 0
-; GFX10-NEXT:    s_or_b32 s35, s34, -4
-; GFX10-NEXT:    s_cmp_lt_u32 s34, 4
-; GFX10-NEXT:    s_cselect_b32 s34, s34, s35
-; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
-; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_cselect_b32 s34, 10, 0xa50f
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1755,14 +1558,7 @@ define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s0, 3, 0
-; GFX11-NEXT:    s_or_b32 s1, s0, -4
-; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
-; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
-; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_cselect_b32 s0, 10, 0xa50f
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %cond, 0


        


More information about the llvm-commits mailing list