[llvm] 3828ea6 - [AMDGPU] Divergence-driven instruction selection for mul i32
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 22 01:36:46 PDT 2021
Author: Jay Foad
Date: 2021-09-22T09:36:34+01:00
New Revision: 3828ea6181fd007438379de70fc7b9fc9c8dbb02
URL: https://github.com/llvm/llvm-project/commit/3828ea6181fd007438379de70fc7b9fc9c8dbb02
DIFF: https://github.com/llvm/llvm-project/commit/3828ea6181fd007438379de70fc7b9fc9c8dbb02.diff
LOG: [AMDGPU] Divergence-driven instruction selection for mul i32
Differential Revision: https://reviews.llvm.org/D109881
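For context, a minimal sketch of the effect of this change (not part of the committed patch; the kernel, value names, and llc invocation below are illustrative assumptions only): the SelectionDAG patterns now distinguish uniform from divergent i32 multiplies, so a mul of two scalar kernel arguments is expected to select to S_MUL_I32 via UniformBinFrag<mul>, while a mul involving the divergent workitem id is expected to select to V_MUL_LO_U32 via DivergentBinFrag<mul>.

define amdgpu_kernel void @mul_uniform_vs_divergent(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
  ; Both operands are scalar kernel arguments, so this multiply is uniform
  ; and should now select to S_MUL_I32.
  %u = mul i32 %a, %b
  ; The workitem id is divergent, so this multiply should still select to
  ; V_MUL_LO_U32.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %d = mul i32 %u, %tid
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  store i32 %d, i32 addrspace(1)* %gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

Compiling this with something like llc -mtriple=amdgcn -mcpu=gfx900 (an assumed invocation) should show both instruction forms in the generated assembly.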
Added:
Modified:
llvm/lib/Target/AMDGPU/SOPInstructions.td
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 0a6afe0bdf4a6..ff7c0a87a7f7f 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -622,9 +622,8 @@ def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
[(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
-// TODO: S_MUL_I32 require V_MUL_LO_I32 from VOP3 change
def S_MUL_I32 : SOP2_32 <"s_mul_i32",
- [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
+ [(set i32:$sdst, (UniformBinFrag<mul> i32:$src0, i32:$src1))]> {
let isCommutable = 1;
}
} // End isReMaterializable = 1
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index f317a6d265cd7..4b216a4ec1572 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -304,7 +304,7 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteIntMul] in {
-defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
+defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index 96b1c77d0849a..79723603f1c1a 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -76,22 +76,22 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; CHECK-NEXT: s_mov_b32 s5, 0x8311eb33
; CHECK-NEXT: s_mov_b32 s6, 0x20140c
; CHECK-NEXT: s_mov_b32 s7, 0xb6db6db7
-; CHECK-NEXT: s_mov_b32 s11, 0x49249249
-; CHECK-NEXT: s_mov_b32 s8, 0x24924924
-; CHECK-NEXT: s_mov_b32 s9, 0xaaaaaaab
-; CHECK-NEXT: s_mov_b32 s10, 0x2aaaaaaa
+; CHECK-NEXT: s_mov_b32 s8, 0x49249249
+; CHECK-NEXT: s_mov_b32 s9, 0x24924924
+; CHECK-NEXT: s_mov_b32 s10, 0xaaaaaaab
+; CHECK-NEXT: s_mov_b32 s11, 0x2aaaaaaa
; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
; CHECK-NEXT: v_and_b32_e32 v1, s4, v1
; CHECK-NEXT: v_and_b32_e32 v2, s4, v2
; CHECK-NEXT: v_mul_lo_u32 v2, v2, s5
; CHECK-NEXT: v_mul_lo_u32 v1, v1, s7
-; CHECK-NEXT: v_mul_lo_u32 v0, v0, s9
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, s10
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xf9dc299a, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, s11, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, s8, v1
; CHECK-NEXT: v_alignbit_b32 v0, v0, v0, 1
-; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s11, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s9, v1
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s6, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 16419132439ac..b19867ea04a52 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -119,9 +119,9 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI: S_BRANCH %bb.4
; SI: bb.2.Flow:
; SI: successors: %bb.3(0x40000000), %bb.5(0x40000000)
- ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %34:vgpr_32, %bb.1, %10, %bb.4
- ; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.1, %9, %bb.4
- ; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %38:vgpr_32, %bb.4
+ ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4
+ ; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4
+ ; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4
; SI: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; SI: S_BRANCH %bb.3
; SI: bb.3.if:
@@ -133,7 +133,6 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI: successors: %bb.2(0x80000000)
; SI: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 [[COPY2]], [[PHI1]], implicit $mode, implicit $exec
; SI: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
- ; SI: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_MUL_LO_U32_e64_]]
; SI: S_BRANCH %bb.2
; SI: bb.5.if.end:
; SI: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
@@ -146,8 +145,8 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
; SI: S_BRANCH %bb.6
; SI: bb.6.for.end:
- ; SI: %33:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec
- ; SI: $vgpr0 = COPY killed %33
+ ; SI: %31:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec
+ ; SI: $vgpr0 = COPY killed %31
; SI: SI_RETURN_TO_EPILOG killed $vgpr0
entry:
; %break = icmp sgt i32 %bound, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 7061a2370938e..3d655d23e73c6 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -322,8 +322,6 @@ define hidden amdgpu_gfx i32 @strict_wwm_called(i32 %a) noinline {
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: v_add_u32_e64 v1, v0, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: v_mul_lo_u32 v0, v1, v0
; GFX9-O0-NEXT: v_sub_u32_e64 v0, v0, v1
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
@@ -350,42 +348,36 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
-; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 7
+; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 2
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 2
-; GFX9-O0-NEXT: s_mov_b32 s8, s4
-; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 2
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
-; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: s_mov_b32 s9, s8
+; GFX9-O0-NEXT: s_mov_b32 s8, s7
; GFX9-O0-NEXT: s_mov_b32 s10, s6
-; GFX9-O0-NEXT: s_mov_b32 s11, s7
-; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 3
-; GFX9-O0-NEXT: v_writelane_b32 v3, s9, 4
-; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 5
-; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 6
+; GFX9-O0-NEXT: s_mov_b32 s11, s5
+; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX9-O0-NEXT: s_mov_b32 s5, s11
+; GFX9-O0-NEXT: s_mov_b32 s6, s10
+; GFX9-O0-NEXT: s_mov_b32 s7, s8
+; GFX9-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15 killed $sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-O0-NEXT: s_mov_b32 s8, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT: s_getpc_b64 s[4:5]
-; GFX9-O0-NEXT: s_add_u32 s4, s4, strict_wwm_called@rel32@lo+4
-; GFX9-O0-NEXT: s_addc_u32 s5, s5, strict_wwm_called@rel32@hi+12
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[2:3]
-; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[0:1]
-; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[12:13]
-; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[14:15]
+; GFX9-O0-NEXT: s_getpc_b64 s[12:13]
+; GFX9-O0-NEXT: s_add_u32 s12, s12, strict_wwm_called@rel32@lo+4
+; GFX9-O0-NEXT: s_addc_u32 s13, s13, strict_wwm_called@rel32@hi+12
+; GFX9-O0-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[16:17]
+; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[18:19]
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 3
-; GFX9-O0-NEXT: v_readlane_b32 s5, v3, 4
-; GFX9-O0-NEXT: v_readlane_b32 s6, v3, 5
-; GFX9-O0-NEXT: v_readlane_b32 s7, v3, 6
+; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[12:13]
; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
@@ -394,7 +386,7 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[4:7], s8 offset:4
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 7
+; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -467,15 +459,11 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mul_lo_u32 v2, v0, v1
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mul_hi_u32 v1, v0, v6
; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mul_lo_u32 v3, v3, v6
; GFX9-O0-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
@@ -485,8 +473,6 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[1:2]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mul_lo_u32 v6, v0, v6
; GFX9-O0-NEXT: s_mov_b32 s5, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 340e7e56b6584..1a7e432dc0407 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -105,14 +105,14 @@ define hidden i32 @called(i32 %a) noinline {
; GFX9-LABEL: {{^}}call:
define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]]
-; GFX9-O0-DAG: s_mov_b32 s0, 0{{$}}
+; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}}
; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]]
; GFX9-O0-DAG: v_mov_b32_e32 v2, v0
; GFX9-O3: v_mov_b32_e32 v2, [[ARG]]
; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
@@ -299,14 +299,14 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline {
; GFX9-LABEL: {{^}}strict_wwm_call:
define amdgpu_kernel void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]]
-; GFX9-O0-DAG: s_mov_b32 s0, 0{{$}}
+; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}}
; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]]
; GFX9-O0-DAG: v_mov_b32_e32 v2, v0
; GFX9-O3: v_mov_b32_e32 v2, [[ARG]]
; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)