[llvm] 200b206 - AMDGPU: Use V_MAC_F32 for fmad.ftz
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 10 14:41:17 PDT 2020
Author: Matt Arsenault
Date: 2020-03-10T14:41:06-07:00
New Revision: 200b20639ac2804318dee9fddfe79c7fea7bd623
URL: https://github.com/llvm/llvm-project/commit/200b20639ac2804318dee9fddfe79c7fea7bd623
DIFF: https://github.com/llvm/llvm-project/commit/200b20639ac2804318dee9fddfe79c7fea7bd623.diff
LOG: AMDGPU: Use V_MAC_F32 for fmad.ftz
This avoids regressions in a future patch. I'm confused by the use of
the gfx9 usage legacy_mad. Was this a pointless instruction rename, or
uses fmul_legacy handling? Why is regular mac avilable in that case?
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir
llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1cace273f05..e0cb46ae6a71 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -848,20 +848,29 @@ def : GCNPat <
// VOP2 Patterns
//===----------------------------------------------------------------------===//
-multiclass FMADPat <ValueType vt, Instruction inst> {
- def : GCNPat <
- (vt (fmad (VOP3NoMods vt:$src0),
- (VOP3NoMods vt:$src1),
- (VOP3NoMods vt:$src2))),
+// TODO: Check only no src2 mods?
+class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
+ : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
+ (vt (VOP3NoMods vt:$src1)),
+ (vt (VOP3NoMods vt:$src2)))),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
- >;
+>;
+
+
+// Prefer mac form when there are no modifiers.
+let AddedComplexity = 9 in {
+def : FMADPat <f32, V_MAC_F32_e64, fmad>;
+def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
+
+let SubtargetPredicate = Has16BitInsts in {
+def : FMADPat <f16, V_MAC_F16_e64, fmad>;
+def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
}
-defm : FMADPat <f16, V_MAC_F16_e64>;
-defm : FMADPat <f32, V_MAC_F32_e64>;
+}
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
+class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
: GCNPat<
(Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
(Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@@ -870,9 +879,8 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-// FIXME: This should select to V_MAC_F32
-def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
-def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
+def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
+def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
let SubtargetPredicate = Has16BitInsts;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir
index da601b86fdb8..05011091a669 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir
@@ -19,8 +19,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
- ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+ ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s32) = COPY $vgpr2
@@ -43,8 +43,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
- ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+ ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = COPY $vgpr1
@@ -67,8 +67,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
- ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+ ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s32) = COPY $vgpr1
@@ -91,8 +91,9 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
- ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+ ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+ ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY3]], 0, 0, implicit $exec
+ ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = COPY $sgpr0
@@ -116,8 +117,8 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
- ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+ ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %1
@@ -138,8 +139,9 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
- ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+ ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %1, %0
@@ -160,8 +162,9 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY1]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
- ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+ ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %1, %0, %0
@@ -181,8 +184,9 @@ body: |
; GCN-LABEL: name: fmad_ftz_s32_vsss
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
- ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
+ ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %0
S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 6d1af9f9eeb8..6d44292ffbe8 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -137,8 +137,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
}
; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
-; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
-; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000
+; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
index 30f5156f459c..000353ee1307 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
@@ -5,8 +5,7 @@
declare half @llvm.amdgcn.fmad.ftz.f16(half %a, half %b, half %c)
; GCN-LABEL: {{^}}mad_f16:
-; GFX8: v_ma{{[dc]}}_f16
-; GFX9: v_mad_legacy_f16
+; GCN: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @mad_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
@@ -34,9 +33,7 @@ define amdgpu_kernel void @mad_f16_imm_a(
}
; GCN-LABEL: {{^}}mad_f16_imm_b:
-; GCN: s_movk_i32 [[KB:s[0-9]+]], 0x4800
-; GFX8: v_mad_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
-; GFX9: v_mad_legacy_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
+; GCN: v_mac_f16_e32 {{v[0-9]+}}, 0x4800, {{v[0-9]+$}}
define amdgpu_kernel void @mad_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
index ed62d5f66343..ca35fbcb0182 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @mad_f32_imm_a(
; GCN-LABEL: {{^}}mad_f32_imm_b:
; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x41000000
-; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, [[KB]],
+; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{[s][0-9]+}}, [[KB]]
define amdgpu_kernel void @mad_f32_imm_b(
float addrspace(1)* %r,
float addrspace(1)* %a,
@@ -48,8 +48,11 @@ define amdgpu_kernel void @mad_f32_imm_b(
}
; GCN-LABEL: {{^}}mad_f32_imm_c:
-; GCN: v_mov_b32_e32 [[KC:v[0-9]+]], 0x41000000
-; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, {{v[0-9]+}}, [[KC]]{{$}}
+; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000
+; GCN: s_load_dword [[A:s[0-9]+]]
+; GCN: s_load_dword [[B:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}}
define amdgpu_kernel void @mad_f32_imm_c(
float addrspace(1)* %r,
float addrspace(1)* %a,
More information about the llvm-commits
mailing list