[llvm] [AMDGPU] Form V_MAD_U64_U32 from mul24/mulhi24 (PR #72393)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 15 06:21:27 PST 2023
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/72393
AMDGPUCodeGenPrepare (CGP) can transform a mul+add that would otherwise form V_MAD_U64_U32 into a (mul24/mulhi24)+add, so add a pattern that matches the 24-bit form as well.
See SWDEV-421067
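
For illustration, here is a rough sketch (not part of the patch; the function name and masks are made up) of the kind of input IR that triggers the CGP rewrite: a plain 64-bit mul+add whose operands are provably narrower than 24 bits, which amdgpu-codegenprepare-mul24 turns into the mul24/mulhi24 form that the new pattern matches:

define i64 @mul24_candidate(i64 %x, i64 %y, i64 %z) {
  ; The masks prove both operands fit in 24 bits, so CGP is expected to
  ; replace the plain mul with llvm.amdgcn.mul.u24 / llvm.amdgcn.mulhi.u24
  ; combined into an i64, leaving the add behind.
  %x24 = and i64 %x, 16777215 ; 0xFFFFFF
  %y24 = and i64 %y, 16777215
  %mul = mul i64 %x24, %y24
  %add = add i64 %mul, %z
  ret i64 %add
}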
From a2a1412deba4989a7d4edd3f3f7fdece38917430 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 15 Nov 2023 15:20:32 +0100
Subject: [PATCH] [AMDGPU] Form V_MAD_U64_U32 from mul24/mulhi24
AMDGPUCodeGenPrepare (CGP) can transform a mul+add that would otherwise form V_MAD_U64_U32 into a (mul24/mulhi24)+add, so add a pattern that matches the 24-bit form as well.
See SWDEV-421067
---
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 9 +++
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 63 +++++++++++++++++++
2 files changed, 72 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 114d33b077866a1..06856b03a508124 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -676,6 +676,15 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
(ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
(EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
>;
+
+ // Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
+ def : GCNPat <
+ (i64 (add (bitconvert (v2i32 (build_vector
+ (AMDGPUmul_u24 i32:$src0, i32:$src1),
+ (AMDGPUmulhi_u24 i32:$src0, i32:$src1)))),
+ i64:$src2)),
+ (inst $src0, $src1, $src2, 0 /* clamp */)
+ >;
}
// exclude pre-GFX9 where it was slow
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 61017e809c86365..56b6fef1b82255c 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -6928,6 +6928,69 @@ entry:
ret <2 x i16> %add0
}
+define i64 @mul_mulhi_u24(i32 %x, i32 %y, i64 %z) {
+; GFX67-LABEL: mul_mulhi_u24:
+; GFX67: ; %bb.0:
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v1
+; GFX67-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, v4, v2
+; GFX67-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_mulhi_u24:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_u32_u24_e32 v4, v0, v1
+; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: mul_mulhi_u24:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: mul_mulhi_u24:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1
+; GFX9-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: mul_mulhi_u24:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3]
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: mul_mulhi_u24:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1
+; GFX10-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y)
+ %mulhi = call i32 @llvm.amdgcn.mulhi.u24(i32 %x, i32 %y)
+ %mul.zext = zext i32 %mul to i64
+ %mulhi.zext = zext i32 %mulhi to i64
+ %mulhi.shift = shl i64 %mulhi.zext, 32
+ %mul.mulhi = or i64 %mulhi.shift, %mul.zext
+ %add = add nuw nsw i64 %mul.mulhi, %z
+ ret i64 %add
+}
+
+declare i32 @llvm.amdgcn.mul.u24(i32, i32)
+declare i32 @llvm.amdgcn.mulhi.u24(i32, i32)
+declare i32 @llvm.amdgcn.mul.i24(i32, i32)
+declare i32 @llvm.amdgcn.mulhi.i24(i32, i32)
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX6: {{.*}}
; GFX7: {{.*}}
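
For reference, V_MAD_U64_U32 computes the full 64-bit product of two 32-bit operands plus a 64-bit addend. A minimal IR sketch of those semantics (names are made up; this is not part of the patch):

define i64 @mad_u64_u32_semantics(i32 %x, i32 %y, i64 %z) {
  ; Zero-extend both 32-bit operands, take the full 64-bit product,
  ; then add the 64-bit addend -- what one v_mad_u64_u32 performs.
  %x64 = zext i32 %x to i64
  %y64 = zext i32 %y to i64
  %prod = mul i64 %x64, %y64
  %mad = add i64 %prod, %z
  ret i64 %mad
}

When both operands fit in 24 bits, (mulhi_u24 << 32) | mul_u24 equals that full 64-bit product, which is what makes the new pattern a valid replacement for the mul24/mulhi24 pair plus add.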