[llvm] f548c4d - AMDGPU: Add mode register use to s_getreg_b32

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue May 7 07:06:58 PDT 2024


Author: Matt Arsenault
Date: 2024-05-07T16:06:51+02:00
New Revision: f548c4d83cdded0c19ca02ca9c071d8ced9ea4fd

URL: https://github.com/llvm/llvm-project/commit/f548c4d83cdded0c19ca02ca9c071d8ced9ea4fd
DIFF: https://github.com/llvm/llvm-project/commit/f548c4d83cdded0c19ca02ca9c071d8ced9ea4fd.diff

LOG: AMDGPU: Add mode register use to s_getreg_b32

This should fix reading the wrong mode after setting the mode.
Ideally we would have separate pseudos for the case that we know
does not read mode.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/fdiv.ll
    llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 93b7e86b5f297..b05d0018201b8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1110,14 +1110,15 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
 
 // This is hasSideEffects to allow its use in readcyclecounter selection.
 // FIXME: Need to truncate immediate to 16-bits.
-// FIXME: Missing mode register use. Should have separate pseudos for
-// known may read MODE and only read MODE.
+// FIXME: Should have separate pseudos for known may read MODE and
+// only read MODE.
 def S_GETREG_B32 : SOPK_Pseudo <
   "s_getreg_b32",
   (outs SReg_32:$sdst), (ins hwreg:$simm16),
   "$sdst, $simm16",
   [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> {
   let hasSideEffects = 1;
+  let Uses = [MODE];
 }
 
 let Defs = [MODE], Uses = [MODE] in {

diff  --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 1e5f4c08c7a00..0468175c5df50 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -2417,12 +2417,12 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -2455,12 +2455,12 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -2727,12 +2727,12 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -2765,12 +2765,12 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3294,12 +3294,12 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3334,12 +3334,12 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX7-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX7-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX7-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3868,12 +3868,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3906,12 +3906,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -4434,12 +4434,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -4474,12 +4474,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX7-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX7-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX7-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5010,12 +5010,12 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, -v0, v1, -v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5048,12 +5048,12 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, -v0, v1, -v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5569,12 +5569,12 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, -v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5607,12 +5607,12 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, -v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6113,12 +6113,12 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, s6, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v5, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6153,12 +6153,12 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX7-NEXT:    v_div_scale_f32 v3, vcc, v0, s6, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
 ; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
 ; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6619,12 +6619,12 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, s6, v0, s6
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v5, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6659,12 +6659,12 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX7-NEXT:    v_div_scale_f32 v3, vcc, s6, v0, s6
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
 ; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
 ; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7168,12 +7168,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7206,12 +7206,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7721,12 +7721,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7759,12 +7759,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 48abc49c41ae0..6a9c4c8d41c20 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -1661,5 +1661,132 @@ define amdgpu_gfx void @s_set_rounding_select_3_5(i32 inreg %cond) {
   ret void
 }
 
+define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
+; GFX6-LABEL: get_rounding_after_set_rounding_1:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_nop 0
+; GFX6-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX6-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX6-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX6-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT:    s_and_b32 s0, s0, 15
+; GFX6-NEXT:    s_add_i32 s1, s0, 4
+; GFX6-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX6-NEXT:    s_cselect_b32 s4, s0, s1
+; GFX6-NEXT:    s_mov_b32 s0, 0
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: get_rounding_after_set_rounding_1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_nop 0
+; GFX7-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX7-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX7-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX7-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX7-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX7-NEXT:    s_and_b32 s0, s0, 15
+; GFX7-NEXT:    s_add_i32 s1, s0, 4
+; GFX7-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX7-NEXT:    s_cselect_b32 s4, s0, s1
+; GFX7-NEXT:    s_mov_b32 s0, 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: get_rounding_after_set_rounding_1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX8-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX8-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX8-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 15
+; GFX8-NEXT:    s_add_i32 s1, s0, 4
+; GFX8-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX8-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: get_rounding_after_set_rounding_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX9-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX9-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX9-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT:    s_and_b32 s0, s0, 15
+; GFX9-NEXT:    s_add_i32 s1, s0, 4
+; GFX9-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX9-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: get_rounding_after_set_rounding_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX10-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX10-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT:    s_and_b32 s0, s0, 15
+; GFX10-NEXT:    s_add_i32 s1, s0, 4
+; GFX10-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX10-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: get_rounding_after_set_rounding_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_round_mode 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX11-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 15
+; GFX11-NEXT:    s_add_i32 s1, s0, 4
+; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  tail call void @llvm.set.rounding(i32 1)
+  %set.mode = tail call i32 @llvm.get.rounding()
+  store volatile i32 %set.mode, ptr addrspace(1) null
+  ret void
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}


        


More information about the llvm-commits mailing list