[llvm] b25c001 - AMDGPU: Fold zext into result of v_mad_u16 on high zeroing targets
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 5 15:41:15 PDT 2023
Author: Matt Arsenault
Date: 2023-06-05T18:41:07-04:00
New Revision: b25c001ad3f33695f82ec06c33b33d248686f4ab
URL: https://github.com/llvm/llvm-project/commit/b25c001ad3f33695f82ec06c33b33d248686f4ab
DIFF: https://github.com/llvm/llvm-project/commit/b25c001ad3f33695f82ec06c33b33d248686f4ab.diff
LOG: AMDGPU: Fold zext into result of v_mad_u16 on high zeroing targets
Avoids regressions in a future patch.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/test/CodeGen/AMDGPU/mad.u16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index ed02c445d64c0..91c6e8ba4f880 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -212,6 +212,12 @@ class is_canonicalized<SDPatternOperator op> : PatFrag<
}];
}
+class FoldTernaryOpPat<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
+ (ops node:$src0, node:$src1, node:$src2),
+ (op2 (op1 node:$src0, node:$src1), node:$src2)
+>;
+
+def imad : FoldTernaryOpPat<mul, add>;
let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 65db790fedf1c..8216139110e7a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -381,36 +381,43 @@ def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus]
-let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
+// Note: 16-bit instructions produce a 0 result in the high 16-bits
+// on GFX8 and GFX9 and preserve high 16 bits on GFX10+
+multiclass Arithmetic_i16_0Hi_TernaryPats <SDPatternOperator op, Instruction inst> {
+ def : GCNPat<
+ (i32 (zext (op i16:$src0, i16:$src1, i16:$src2))),
+ (inst VSrc_b16:$src0, VSrc_b16:$src1, VSrc_b16:$src2)
+ >;
+}
-multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst> {
-def : GCNPat <
- (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
- (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
->;
+let Predicates = [Has16BitInsts, isGFX8GFX9] in {
+defm : Arithmetic_i16_0Hi_TernaryPats<imad, V_MAD_U16_e64>;
+}
+
+let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
+// FIXME: Should be able to just pass imad to the instruction
+// definition pattern, but the implied clamp input interferes.
+multiclass Ternary_i16_Pats <SDPatternOperator op, Instruction inst> {
+ def : GCNPat <
+ (op i16:$src0, i16:$src1, i16:$src2),
+ (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
+ >;
}
-defm: Ternary_i16_Pats<mul, add, V_MAD_U16_e64>;
-defm: Ternary_i16_Pats<mul, add, V_MAD_I16_e64>;
+defm: Ternary_i16_Pats<imad, V_MAD_U16_e64>;
} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst> {
-def : GCNPat <
+class Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst> : GCNPat <
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
>;
-}
-
-defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
-defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>;
-
+let Predicates = [Has16BitInsts, isGFX10Plus] in {
+def: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
} // End Predicates = [Has16BitInsts, isGFX10Plus]
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index c70dabed3561c..2eff6019cfb5e 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -136,15 +136,13 @@ define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX8-LABEL: v_mad_u16_zext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v2
+; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_u16_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v2
+; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mad_u16_zext:
@@ -173,16 +171,14 @@ define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX8-LABEL: v_mad_u16_zext64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v2
+; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_u16_zext64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v2
+; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
More information about the llvm-commits mailing list