[llvm] 3f34f75 - [AMDGPU] Fix latency for implicit vcc_lo operands on GFX10 wave32

Fri Oct 22 12:15:09 PDT 2021

Author: Jay Foad
Date: 2021-10-22T20:03:29+01:00
New Revision: 3f34f75a68c34cc859d97943673ee44925702d10

URL: https://github.com/llvm/llvm-project/commit/3f34f75a68c34cc859d97943673ee44925702d10
DIFF: https://github.com/llvm/llvm-project/commit/3f34f75a68c34cc859d97943673ee44925702d10.diff

LOG: [AMDGPU] Fix latency for implicit vcc_lo operands on GFX10 wave32

As described in the code comment added to
GCNSubtarget::adjustSchedDependency, the way
SIInstrInfo::fixImplicitOperands changes vcc to vcc_lo in these
implicit operands confuses ScheduleDAGInstrs::addPhysRegDataDeps into
treating them as implicit pseudo operands. Fix this by setting the
correct latency from the SchedModel after addPhysRegDataDeps wrongly
set it to 0.

Differential Revision: https://reviews.llvm.org/D112317

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
    llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index ffc0c403a2a1..1d3b12380bb9 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -967,6 +967,13 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
       --Lat;
     }
     Dep.setLatency(Lat);
+  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
+    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
+    // implicit operands which come from the MCInstrDesc, which can fool
+    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
+    // pseudo operands.
+    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
+        DefI, DefOpIdx, UseI, UseOpIdx));
   }
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index ec723b8b879f..e7a80cd30933 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1134,6 +1134,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   }
 
   static unsigned getDSShaderTypeValue(const MachineFunction &MF);
+
+  const TargetSchedModel &getSchedModel() const { return SchedModel; }
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index c208caf5bc33..97759297493f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -115,8 +115,8 @@ define amdgpu_ps float @s_div_fmas_f32(float inreg %a, float inreg %b, float inr
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_cmp_eq_u32 s3, 0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10_W32-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10_W32-NEXT:    s_and_b32 s3, 1, s3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
@@ -178,12 +178,12 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10_W32-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10_W32-NEXT:    s_and_b32 s6, 1, s6
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
+; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
 ; GFX10_W32-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10_W32-NEXT:    v_readfirstlane_b32 s1, v1
@@ -260,8 +260,8 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32]
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
@@ -558,9 +558,9 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
 ; GFX10_W32-NEXT:    s_and_b32 s0, 1, s2
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s10
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s11
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3]
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10_W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -634,8 +634,8 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10_W32-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 1205b52fff1c..bd173aaed0a0 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -83,8 +83,8 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -332,9 +332,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0x3fb, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0