[llvm] AMDGPU: Optimize set_rounding if input is known to fit in 2 bits (PR #88588)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 18 07:42:19 PDT 2024


================
@@ -0,0 +1,1715 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX678,GFX7 %s
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX678,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
+
+declare void @llvm.set.rounding(i32)
+declare i32 @llvm.get.rounding()
+
+; Dynamic rounding mode passed as an inreg argument under the amdgpu_gfx
+; calling convention. On every target the expected code derives a table index
+; from the argument (values below 4 are used unchanged, otherwise 4 is
+; subtracted), multiplies the index by 4 bits, shifts a 64-bit constant
+; (high word 0xb73e62d9, low word 0x1c84a50f) right by that amount, and writes
+; the result to the 4-bit field at offset 0 of the MODE register with
+; s_setreg_b32.
+define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) {
+; GFX678-LABEL: s_set_rounding:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_add_i32 s34, s4, -4
+; GFX678-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX678-NEXT:    s_cselect_b32 s34, s4, s34
+; GFX678-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX678-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX678-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX678-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s34, s4, -4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX9-NEXT:    s_cselect_b32 s34, s4, s34
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_add_i32 s34, s4, -4
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX10-NEXT:    s_cselect_b32 s34, s4, s34
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s4, -4
+; GFX11-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX11-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+; Dynamic rounding mode in a kernel. The argument is first loaded through
+; s[0:1] (presumably the kernarg segment pointer - confirm against the ABI),
+; then goes through the same index / 64-bit table shift / s_setreg_b32 sequence
+; as the non-kernel dynamic case. The empty side-effecting inline asm follows
+; the call in the IR, but its ASMSTART/ASMEND markers are expected before the
+; mode-register write in the output.
+define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_kernel:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX6-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX6-NEXT:    ;;#ASMSTART
+; GFX6-NEXT:    ;;#ASMEND
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s3, s2, -4
+; GFX6-NEXT:    s_cmp_lt_u32 s2, 4
+; GFX6-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: s_set_rounding_kernel:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX7-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s3, s2, -4
+; GFX7-NEXT:    s_cmp_lt_u32 s2, 4
+; GFX7-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX7-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: s_set_rounding_kernel:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX8-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_add_i32 s3, s2, -4
+; GFX8-NEXT:    s_cmp_lt_u32 s2, 4
+; GFX8-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_set_rounding_kernel:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s3, s2, -4
+; GFX9-NEXT:    s_cmp_lt_u32 s2, 4
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: s_set_rounding_kernel:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_i32 s1, s0, -4
+; GFX10-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX10-NEXT:    s_cselect_b32 s2, s0, s1
+; GFX10-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: s_set_rounding_kernel:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX11-NEXT:    s_cselect_b32 s2, s0, s1
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_endpgm
+  call void @llvm.set.rounding(i32 %rounding)
+  call void asm sideeffect "",""()
+  ret void
+}
+
+; Dynamic rounding mode passed as an ordinary (non-inreg) argument, which
+; arrives in v0. The index selection and the 64-bit table shift are expected to
+; be done with vector instructions; the shifted value is then copied to a
+; scalar register with v_readfirstlane_b32 so it can be written with
+; s_setreg_b32.
+define void @v_set_rounding(i32 %rounding) {
+; GFX6-LABEL: v_set_rounding:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, -4, v0
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v0
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_set_rounding:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, -4, v0
+; GFX7-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX7-NEXT:    v_lshr_b64 v[0:1], s[4:5], v0
+; GFX7-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_set_rounding:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -4, v0
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_set_rounding:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -4, v0
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_set_rounding:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -4, v0
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 4, v0
+; GFX10-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_set_rounding:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -4, v0
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 4, v0
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+; Round trip: the value returned by llvm.get.rounding is passed directly to
+; llvm.set.rounding. The expected output reads the 4-bit MODE field with
+; s_getreg_b32, converts it through one 64-bit table (high word 0xc96f385, low
+; word 0xeb24da71) followed by a "+4 unless below 4" adjustment, and then runs
+; the usual set_rounding sequence ("-4 unless below 4", second table, then
+; s_setreg_b32). Both conversions are present in the expected code; they are
+; not cancelled against each other.
+define void @set_rounding_get_rounding() {
+; GFX678-LABEL: set_rounding_get_rounding:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX678-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX678-NEXT:    s_mov_b32 s4, 0xeb24da71
+; GFX678-NEXT:    s_mov_b32 s5, 0xc96f385
+; GFX678-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX678-NEXT:    s_and_b32 s4, s4, 15
+; GFX678-NEXT:    s_add_i32 s5, s4, 4
+; GFX678-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX678-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX678-NEXT:    s_add_i32 s5, s4, -4
+; GFX678-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX678-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX678-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX678-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX678-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX678-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: set_rounding_get_rounding:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX9-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX9-NEXT:    s_mov_b32 s4, 0xeb24da71
+; GFX9-NEXT:    s_mov_b32 s5, 0xc96f385
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX9-NEXT:    s_and_b32 s4, s4, 15
+; GFX9-NEXT:    s_add_i32 s5, s4, 4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX9-NEXT:    s_add_i32 s5, s4, -4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX9-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX9-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: set_rounding_get_rounding:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_getreg_b32 s6, hwreg(HW_REG_MODE, 0, 4)
+; GFX10-NEXT:    s_mov_b32 s4, 0xeb24da71
+; GFX10-NEXT:    s_mov_b32 s5, 0xc96f385
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 2
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX10-NEXT:    s_and_b32 s4, s4, 15
+; GFX10-NEXT:    s_add_i32 s5, s4, 4
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    s_add_i32 s5, s4, -4
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: set_rounding_get_rounding:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 4)
+; GFX11-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX11-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 15
+; GFX11-NEXT:    s_add_i32 s1, s0, 4
+; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %rounding = call i32 @llvm.get.rounding()
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+; Constant argument 0: expected to fold to an immediate write of 15 (0xf) to
+; the rounding-mode field, via s_setreg_imm32_b32 on the pre-gfx10 targets and
+; s_round_mode on gfx1030/gfx1100.
+define void @s_set_rounding_0() {
+; GFX678-LABEL: s_set_rounding_0:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_0:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xf
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 0)
+  ret void
+}
+
+; Constant argument 1: expected to fold to an immediate write of 0 to the
+; rounding-mode field (s_setreg_imm32_b32 on the pre-gfx10 targets,
+; s_round_mode on gfx1030/gfx1100).
+define void @s_set_rounding_1() {
+; GFX678-LABEL: s_set_rounding_1:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 1)
+  ret void
+}
+
+; Constant argument 2: expected to fold to an immediate write of 5 to the
+; rounding-mode field (s_setreg_imm32_b32 on the pre-gfx10 targets,
+; s_round_mode on gfx1030/gfx1100).
+define void @s_set_rounding_2() {
+; GFX678-LABEL: s_set_rounding_2:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_2:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x5
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 2)
+  ret void
+}
+
+; Constant argument 3: expected to fold to an immediate write of 10 (0xa) to
+; the rounding-mode field (s_setreg_imm32_b32 on the pre-gfx10 targets,
+; s_round_mode on gfx1030/gfx1100).
+define void @s_set_rounding_3() {
+; GFX678-LABEL: s_set_rounding_3:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_3:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xa
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 3)
+  ret void
+}
+
+; Unsupported mode: constant argument 4 is expected to produce the same
+; immediate (15, 0xf) as constant argument 0.
+define void @s_set_rounding_4() {
+; GFX678-LABEL: s_set_rounding_4:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_4:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xf
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 4)
+  ret void
+}
+
+; Undefined mode: constant argument 5 is expected to produce the same
+; immediate (0) as constant argument 1.
+define void @s_set_rounding_5() {
+; GFX678-LABEL: s_set_rounding_5:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_5:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 5)
+  ret void
+}
+
+; Undefined mode: constant argument 6 is expected to produce the same
+; immediate (5) as constant argument 2.
+define void @s_set_rounding_6() {
+; GFX678-LABEL: s_set_rounding_6:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_6:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x5
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 6)
+  ret void
+}
+
+; "Dynamic"
+; Constant argument 7 is expected to produce the same immediate (10, 0xa) as
+; constant argument 3.
+define void @s_set_rounding_7() {
+; GFX678-LABEL: s_set_rounding_7:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_7:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xa
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 7)
+  ret void
+}
+
+; Invalid value: constant argument -1 still folds to an immediate write; the
+; expected immediate is 11 (0xb).
+define void @s_set_rounding_neg1() {
+; GFX678-LABEL: s_set_rounding_neg1:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_neg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_neg1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xb
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 -1)
+  ret void
+}
+
+; --------------------------------------------------------------------
+; Test extended values
+; --------------------------------------------------------------------
+
+; NearestTiesToEvenF32_TowardPositiveF64 = 8
+; Expected rounding-mode field value: 4 (0x4).
+define void @s_set_rounding_8() {
+; GFX678-LABEL: s_set_rounding_8:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_8:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x4
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 8)
+  ret void
+}
+
+; NearestTiesToEvenF32_TowardNegativeF64 = 9
+; Expected rounding-mode field value: 8 (0x8).
+define void @s_set_rounding_9() {
+; GFX678-LABEL: s_set_rounding_9:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_9:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_9:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x8
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 9)
+  ret void
+}
+
+; NearestTiesToEvenF32_TowardZeroF64 = 10
+; Expected rounding-mode field value: 12 (0xc).
+define void @s_set_rounding_10() {
+; GFX678-LABEL: s_set_rounding_10:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 12
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_10:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 12
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_10:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xc
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 10)
+  ret void
+}
+
+; TowardPositiveF32_NearestTiesToEvenF64 = 11
+; Expected rounding-mode field value: 1 (0x1).
+define void @s_set_rounding_11() {
+; GFX678-LABEL: s_set_rounding_11:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_11:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_11:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x1
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 11)
+  ret void
+}
+
+; TowardPositiveF32_TowardNegativeF64 = 12
+; Expected rounding-mode field value: 9 (0x9).
+define void @s_set_rounding_12() {
+; GFX678-LABEL: s_set_rounding_12:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 9
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_12:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 9
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_12:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x9
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 12)
+  ret void
+}
+
+; TowardPositiveF32_TowardZeroF64 = 13
+; Expected rounding-mode field value: 13 (0xd).
+define void @s_set_rounding_13() {
+; GFX678-LABEL: s_set_rounding_13:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 13
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_13:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 13
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_13:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xd
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 13)
+  ret void
+}
+
+; TowardNegativeF32_NearestTiesToEvenF64 = 14
+; Expected rounding-mode field value: 2 (0x2).
+define void @s_set_rounding_14() {
+; GFX678-LABEL: s_set_rounding_14:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_14:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_14:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x2
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 14)
+  ret void
+}
+
+; TowardNegativeF32_TowardPositiveF64 = 15
+; Expected rounding-mode field value: 6 (0x6).
+define void @s_set_rounding_15() {
+; GFX678-LABEL: s_set_rounding_15:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 6
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_15:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 6
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_15:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x6
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 15)
+  ret void
+}
+
+
+; TowardNegativeF32_TowardZeroF64 = 16
+; Expected rounding-mode field value: 14 (0xe).
+define void @s_set_rounding_16() {
+; GFX678-LABEL: s_set_rounding_16:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 14
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 14
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_16:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xe
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 16)
+  ret void
+}
+
+; TowardZeroF32_NearestTiesToEvenF64 = 17
+; Expected rounding-mode field value: 3 (0x3).
+define void @s_set_rounding_17() {
+; GFX678-LABEL: s_set_rounding_17:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 3
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_17:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_17:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x3
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 17)
+  ret void
+}
+
+; TowardZeroF32_TowardPositiveF64 = 18
+; Expected rounding-mode field value: 7 (0x7).
+define void @s_set_rounding_18() {
+; GFX678-LABEL: s_set_rounding_18:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 7
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_18:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_18:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x7
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 18)
+  ret void
+}
+
+; TowardZeroF32_TowardNegativeF64 = 19
+; Expected rounding-mode field value: 11 (0xb).
+define void @s_set_rounding_19() {
+; GFX678-LABEL: s_set_rounding_19:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_19:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_19:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xb
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 19)
+  ret void
+}
+
+; Invalid, out of bounds: constant argument 20 is past the last named extended
+; value (19). The expected immediate is 11 (0xb), the same as for 19.
+define void @s_set_rounding_20() {
+; GFX678-LABEL: s_set_rounding_20:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_20:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_20:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xb
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 20)
+  ret void
+}
+
+; Out-of-range constant argument 65535 (0xffff): expected to fold to an
+; immediate write of 11 (0xb), the same value expected for -1 and 20.
+define void @s_set_rounding_0xffff() {
+; GFX678-LABEL: s_set_rounding_0xffff:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_0xffff:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_0xffff:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xb
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 65535)
+  ret void
+}
+
+; --------------------------------------------------------------------
+; Test the optimization that applies when the value is known to be
+; within the standard range
+; --------------------------------------------------------------------
+
+; The rounding mode is an i2 argument zero-extended to i32, so the value
+; handed to llvm.set.rounding can only be 0..3. The autogenerated checks
+; expect a shortened sequence: shift the input left by 2, shift the 32-bit
+; constant 0xa50f right by that amount, and write the result with
+; s_setreg_b32 hwreg(HW_REG_MODE, 0, 4). There is no add/compare/select and
+; no 64-bit shift as in the unconstrained s_set_rounding test. On GFX8 and
+; newer targets the input is additionally masked with 0xffff first.
+define amdgpu_gfx void @s_set_rounding_i2_zeroext(i2 zeroext inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_i2_zeroext:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_lshl_b32 s34, s4, 2
+; GFX6-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_i2_zeroext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_lshl_b32 s34, s4, 2
+; GFX7-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_i2_zeroext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX8-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX8-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_i2_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX9-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX9-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_i2_zeroext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX10-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX10-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_i2_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-NEXT:    s_lshr_b32 s0, 0xa50f, s0
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %zext.rounding = zext i2 %rounding to i32
+  call void @llvm.set.rounding(i32 %zext.rounding)
+  ret void
+}
+
+; The rounding mode is an i2 argument sign-extended to i32, so the value
+; handed to llvm.set.rounding is in -2..1. The autogenerated checks expect
+; the general sequence here: add -4, unsigned compare against 4, select,
+; shift left by 2, then a 64-bit right shift of the constant pair
+; 0x1c84a50f / 0xb73e62d9 before writing hwreg(HW_REG_MODE, 0, 4).
+; NOTE(review): presumably the shortened form is not used because negative
+; values fall outside 0..3 -- confirm against the lowering code.
+; GFX8 and newer targets first sign-extend the input with s_sext_i32_i16.
+define amdgpu_gfx void @s_set_rounding_i2_signext(i2 signext inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_i2_signext:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s34, s4, -4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX6-NEXT:    s_cselect_b32 s34, s4, s34
+; GFX6-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX6-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX6-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_i2_signext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s34, s4, -4
+; GFX7-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX7-NEXT:    s_cselect_b32 s34, s4, s34
+; GFX7-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX7-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX7-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_i2_signext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sext_i32_i16 s34, s4
+; GFX8-NEXT:    s_add_i32 s35, s34, -4
+; GFX8-NEXT:    s_cmp_lt_u32 s34, 4
+; GFX8-NEXT:    s_cselect_b32 s34, s34, s35
+; GFX8-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX8-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX8-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_i2_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sext_i32_i16 s34, s4
+; GFX9-NEXT:    s_add_i32 s35, s34, -4
+; GFX9-NEXT:    s_cmp_lt_u32 s34, 4
+; GFX9-NEXT:    s_cselect_b32 s34, s34, s35
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_i2_signext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_sext_i32_i16 s34, s4
+; GFX10-NEXT:    s_add_i32 s35, s34, -4
+; GFX10-NEXT:    s_cmp_lt_u32 s34, 4
+; GFX10-NEXT:    s_cselect_b32 s34, s34, s35
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_i2_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_sext_i32_i16 s0, s4
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sext.rounding = sext i2 %rounding to i32
+  call void @llvm.set.rounding(i32 %sext.rounding)
+  ret void
+}
+
+; The rounding mode is an i3 argument sign-extended to i32, so the value
+; handed to llvm.set.rounding is in -4..3. The autogenerated checks expect
+; the general sequence: add -4, unsigned compare against 4, select, shift
+; left by 2, then a 64-bit right shift of the constant pair
+; 0x1c84a50f / 0xb73e62d9 before writing hwreg(HW_REG_MODE, 0, 4).
+; NOTE(review): presumably a negative case showing that a 3-bit signed
+; input does not qualify for the shortened form -- confirm.
+; GFX8 and newer targets first sign-extend the input with s_sext_i32_i16.
+define amdgpu_gfx void @s_set_rounding_i3_signext(i3 signext inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_i3_signext:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s34, s4, -4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX6-NEXT:    s_cselect_b32 s34, s4, s34
+; GFX6-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX6-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX6-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_i3_signext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s34, s4, -4
+; GFX7-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX7-NEXT:    s_cselect_b32 s34, s4, s34
+; GFX7-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX7-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX7-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_i3_signext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sext_i32_i16 s34, s4
+; GFX8-NEXT:    s_add_i32 s35, s34, -4
+; GFX8-NEXT:    s_cmp_lt_u32 s34, 4
+; GFX8-NEXT:    s_cselect_b32 s34, s34, s35
+; GFX8-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX8-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX8-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_i3_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sext_i32_i16 s34, s4
+; GFX9-NEXT:    s_add_i32 s35, s34, -4
+; GFX9-NEXT:    s_cmp_lt_u32 s34, 4
+; GFX9-NEXT:    s_cselect_b32 s34, s34, s35
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_i3_signext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_sext_i32_i16 s34, s4
+; GFX10-NEXT:    s_add_i32 s35, s34, -4
+; GFX10-NEXT:    s_cmp_lt_u32 s34, 4
+; GFX10-NEXT:    s_cselect_b32 s34, s34, s35
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_i3_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_sext_i32_i16 s0, s4
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sext.rounding = sext i3 %rounding to i32
+  call void @llvm.set.rounding(i32 %sext.rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_i3_zeroext:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s34, s4, -4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX6-NEXT:    s_cselect_b32 s34, s4, s34
+; GFX6-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX6-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX6-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_i3_zeroext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s34, s4, -4
+; GFX7-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX7-NEXT:    s_cselect_b32 s34, s4, s34
+; GFX7-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX7-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX7-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_i3_zeroext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s36, 0xffff, s4
+; GFX8-NEXT:    v_cmp_lt_u16_e64 s[34:35], s4, 4
+; GFX8-NEXT:    s_add_i32 s37, s36, -4
+; GFX8-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GFX8-NEXT:    s_cselect_b32 s34, s36, s37
+; GFX8-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX8-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX8-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_i3_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s36, 0xffff, s4
+; GFX9-NEXT:    v_cmp_lt_u16_e64 s[34:35], s4, 4
+; GFX9-NEXT:    s_add_i32 s37, s36, -4
+; GFX9-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GFX9-NEXT:    s_cselect_b32 s34, s36, s37
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_i3_zeroext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_u16_e64 s34, s4, 4
+; GFX10-NEXT:    s_and_b32 s35, 0xffff, s4
+; GFX10-NEXT:    s_add_i32 s36, s35, -4
+; GFX10-NEXT:    s_and_b32 s34, s34, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s34, s35, s36
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_i3_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_u16_e64 s0, s4, 4
----------------
jayfoad wrote:

Not related to your patch, but this is horrible codegen, using v_cmp to compare sgpr values.

https://github.com/llvm/llvm-project/pull/88588


More information about the llvm-commits mailing list