[llvm] AMDGPU GlobalISel Add64 support (PR #124763)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 30 05:54:47 PST 2025
https://github.com/tpopp updated https://github.com/llvm/llvm-project/pull/124763
From 467405361e65691ea756ca62449740ba0bc44fa4 Mon Sep 17 00:00:00 2001
From: Tres Popp <git at tpopp.com>
Date: Tue, 28 Jan 2025 06:13:44 -0800
Subject: [PATCH] AMDGPU GlobalISel: 64-bit G_ADD and G_PTR_ADD support

On subtargets with v_lshl_add_u64 (hasLshlAddB64), mark 64-bit G_ADD
as legal instead of expanding it, and add selection patterns so that
G_PTR_ADD, with or without a feeding shift, selects to
V_LSHL_ADD_U64.
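
For example, from the updated lshl-add-u64.ll checks below: a GEP
whose index is scaled by the element size is lowered to a ptradd of a
shift, which now selects to a single instruction on gfx940:

  define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
    %gep = getelementptr inbounds i32, ptr %p, i64 %a
    %v = load i32, ptr %gep
    ret i32 %v
  }

  ; the address computation selects to:
  ;   v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]

A plain 64-bit add uses the same instruction with a shift amount of
zero (v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]).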
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 18 +++-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 18 ++++
llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll | 88 +++++++++++++++----
3 files changed, 106 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e9e47eaadd557f..977cbca309aeea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -730,13 +730,29 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
} else {
- getActionDefinitionsBuilder({G_ADD, G_SUB})
+ getActionDefinitionsBuilder(G_SUB)
.legalFor({S32, S16, V2S16})
.clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
+ if (ST.hasLshlAddB64())
+ getActionDefinitionsBuilder(G_ADD)
+ .legalFor({S64, S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
+ else
+ getActionDefinitionsBuilder(G_ADD)
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
}
if (ST.hasScalarSMulU64()) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c06c932a5375e8..e2d1ec7556086e 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -737,6 +737,24 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
+let SubtargetPredicate = isGFX940Plus in {
+// TODO: Canonicalize these in the target-specific CombinerHelper?
+def : GCNPat<
+ (ptradd (shl i64:$src0, i32:$shift), i64:$src1),
+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$shift, VSrc_b64:$src1)
+>;
+
+def : GCNPat<
+ (ptradd i64:$src0, (shl i64:$src1, i32:$shift)),
+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src1, VSrc_b32:$shift, VSrc_b64:$src0)
+>;
+
+def : GCNPat<
+ (ptradd i64:$src0, i64:$src1),
+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, (i32 0), VSrc_b64:$src1)
+>;
+}
+
def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index e9a1b38eee157d..5c60e27b1b3842 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -1,8 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v1v:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 1
%add = add i64 %shl, %a
ret i64 %add
@@ -10,7 +14,10 @@ define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v4v:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 4
%add = add i64 %shl, %a
ret i64 %add
@@ -18,8 +25,10 @@ define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v5v:
-; GCN: v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 5, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 5
%add = add i64 %shl, %a
ret i64 %add
@@ -27,8 +36,10 @@ define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
; GCN-LABEL: lshl_add_u64_vvv:
-; GCN: v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], v2, v[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, %s
%add = add i64 %shl, %a
ret i64 %add
@@ -36,7 +47,13 @@ define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
; GCN-LABEL: lshl_add_u64_s2v:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 2, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%a = load i64, ptr undef
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
@@ -46,7 +63,13 @@ define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
; GCN-LABEL: lshl_add_u64_v2s:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 2, s[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%v = load i64, ptr undef
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
@@ -56,9 +79,14 @@ define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_s2s:
-; GCN: s_lshl_b64
-; GCN: s_add_u32
-; GCN: s_addc_u32
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
store i64 %add, ptr undef
@@ -67,14 +95,23 @@ define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
define i64 @add_u64_vv(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_vv:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %v, %a
ret i64 %add
}
define amdgpu_kernel void @add_u64_sv(i64 %v) {
; GCN-LABEL: add_u64_sv:
-; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%a = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -83,7 +120,13 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
define amdgpu_kernel void @add_u64_vs(i64 %a) {
; GCN-LABEL: add_u64_vs:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%v = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -92,8 +135,14 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_ss:
-; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s1, s3
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s0, s0, s2
+; GCN-NEXT: s_addc_u32 s1, s1, s3
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%add = add i64 %v, %a
store i64 %add, ptr undef
ret void
@@ -101,7 +150,12 @@ define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
; GCN-LABEL: lshl_add_u64_gep:
-; GCN: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GCN-NEXT: flat_load_dword v0, v[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i32, ptr %p, i64 %a
%v = load i32, ptr %gep
ret i32 %v