[llvm] AMDGPU GlobalISel Add64 support (PR #124763)
Alan Li via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 28 07:06:17 PST 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/124763
From 9849ed9142f1231a1904667beb2226bf8d3d9e84 Mon Sep 17 00:00:00 2001
From: Tres Popp <git at tpopp.com>
Date: Tue, 28 Jan 2025 06:13:44 -0800
Subject: [PATCH 1/3] AMDGPU GlobalISel: support 64-bit G_ADD and G_PTR_ADD
On subtargets with lshl_add_b64 (hasLshlAddB64), this marks 64-bit G_ADD as
legal and adds selection patterns that fold ptradd, with an optional left
shift, into V_LSHL_ADD_U64.
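
For illustration, this is the shape of IR the patterns target — a 64-bit
shift feeding a 64-bit add, which gfx942 can select as a single instruction.
A minimal sketch mirroring the lshl_add_u64_v4v test below (the function
name is illustrative):

define i64 @shl_add_example(i64 %v, i64 %a) {
  ; On gfx942 both operations are expected to select to:
  ;   v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
  %shl = shl i64 %v, 4
  %add = add i64 %shl, %a
  ret i64 %add
}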
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 18 +++-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 18 ++++
llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll | 87 +++++++++++++++----
3 files changed, 105 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 740e52fb87dc2..dcf7a0777178d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -736,13 +736,29 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
} else {
- getActionDefinitionsBuilder({G_ADD, G_SUB})
+ getActionDefinitionsBuilder(G_SUB)
.legalFor({S32, S16, V2S16})
.clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
+ if (ST.hasLshlAddB64())
+ getActionDefinitionsBuilder(G_ADD)
+ .legalFor({S64, S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
+ else
+ getActionDefinitionsBuilder(G_ADD)
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
}
if (ST.hasScalarSMulU64()) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 1447804871809..9108760a1f6c7 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -762,6 +762,24 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
+let SubtargetPredicate = isGFX940Plus in {
+// TODO: Canonicalize these in the target specific CombinerHelper?
+def : GCNPat<
+ (ptradd (shl i64:$src0, i32:$shift), i64:$src1),
+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$shift, VSrc_b64:$src1)
+>;
+
+def : GCNPat<
+ (ptradd i64:$src0, (shl i64:$src1, i32:$shift)),
+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src1, VSrc_b32:$shift, VSrc_b64:$src0)
+>;
+
+def : GCNPat<
+ (ptradd i64:$src0, i64:$src1),
+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, (i32 0), VSrc_b64:$src1)
+>;
+}
+
def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index 4262cc44a6e74..a7ccaf79ecae6 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -2,7 +2,10 @@
define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v1v:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 1
%add = add i64 %shl, %a
ret i64 %add
@@ -10,7 +13,10 @@ define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v4v:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 4
%add = add i64 %shl, %a
ret i64 %add
@@ -18,8 +24,10 @@ define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v5v:
-; GCN: v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 5, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 5
%add = add i64 %shl, %a
ret i64 %add
@@ -27,8 +35,10 @@ define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
; GCN-LABEL: lshl_add_u64_vvv:
-; GCN: v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], v2, v[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, %s
%add = add i64 %shl, %a
ret i64 %add
@@ -36,7 +46,13 @@ define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
; GCN-LABEL: lshl_add_u64_s2v:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 2, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%a = load i64, ptr undef
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
@@ -46,7 +62,13 @@ define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
; GCN-LABEL: lshl_add_u64_v2s:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 2, s[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%v = load i64, ptr undef
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
@@ -56,9 +78,14 @@ define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_s2s:
-; GCN: s_lshl_b64
-; GCN: s_add_u32
-; GCN: s_addc_u32
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
store i64 %add, ptr undef
@@ -67,14 +94,23 @@ define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
define i64 @add_u64_vv(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_vv:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %v, %a
ret i64 %add
}
define amdgpu_kernel void @add_u64_sv(i64 %v) {
; GCN-LABEL: add_u64_sv:
-; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%a = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -83,7 +119,13 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
define amdgpu_kernel void @add_u64_vs(i64 %a) {
; GCN-LABEL: add_u64_vs:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%v = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -92,8 +134,14 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_ss:
-; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s1, s3
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s0, s0, s2
+; GCN-NEXT: s_addc_u32 s1, s1, s3
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
%add = add i64 %v, %a
store i64 %add, ptr undef
ret void
@@ -101,7 +149,12 @@ define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
; GCN-LABEL: lshl_add_u64_gep:
-; GCN: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GCN-NEXT: flat_load_dword v0, v[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i32, ptr %p, i64 %a
%v = load i32, ptr %gep
ret i32 %v
From eb6f6d857bab0acb43d841d0e6b3259ad7ffd3bc Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Thu, 27 Feb 2025 09:01:00 -0500
Subject: [PATCH 2/3] Address review comments: introduce a HasLShlAddB64
 predicate and deduplicate the G_ADD legalizer rules
Signed-off-by: Alan Li <me at alanli.org>
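
In short, this revision moves the V_LSHL_ADD_U64 patterns under a new
HasLShlAddB64 predicate and collapses the duplicated G_ADD builder chains
into one, varying only the legal type set. With S64 in that set, GlobalISel
keeps 64-bit adds whole, so even a plain add selects to the fused
instruction with a zero shift — a sketch mirroring the add_u64_vv test
below:

define i64 @plain_add_example(i64 %v, i64 %a) {
  ; Expected gfx942 selection (shift amount of 0):
  ;   v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
  %add = add i64 %v, %a
  ret i64 %add
}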
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 2 ++
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 27 ++++++++-----------
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 13 +++++----
3 files changed, 19 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index effc8d2ed6b49..35b6b830e332c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2520,6 +2520,8 @@ def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
+def HasLShlAddB64 : Predicate<"Subtarget->hasLshlAddB64()">;
+
// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index dcf7a0777178d..73afd70d87685 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
@@ -743,22 +744,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
- if (ST.hasLshlAddB64())
- getActionDefinitionsBuilder(G_ADD)
- .legalFor({S64, S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
- .scalarize(0)
- .minScalar(0, S16)
- .widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32);
- else
- getActionDefinitionsBuilder(G_ADD)
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
- .scalarize(0)
- .minScalar(0, S16)
- .widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32);
+
+ getActionDefinitionsBuilder(G_ADD)
+ .legalFor(ST.hasLshlAddB64()
+ ? std::initializer_list<LLT>{S32, S16, V2S16, S64}
+ : std::initializer_list<LLT>{S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
}
if (ST.hasScalarSMulU64()) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 9108760a1f6c7..e9a9ffda831cc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -762,7 +762,7 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
-let SubtargetPredicate = isGFX940Plus in {
+let SubtargetPredicate = HasLShlAddB64 in {
// TODO: Canonicalize these in the target specific CombinerHelper?
def : GCNPat<
(ptradd (shl i64:$src0, i32:$shift), i64:$src1),
@@ -778,17 +778,16 @@ def : GCNPat<
(ptradd i64:$src0, i64:$src1),
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, (i32 0), VSrc_b64:$src1)
>;
-}
-def : GCNPat<
- (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
- (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
-
-let SubtargetPredicate = isGFX940Plus in
def : GCNPat<
(ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
>;
+} // End SubtargetPredicate = HasLShlAddB64
+
+def : GCNPat<
+ (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
+ (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
From 5cdbd92f7bcd50cefa84165f98a8016ba2ca6ca2 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Fri, 28 Feb 2025 09:42:52 -0500
Subject: [PATCH 3/3] Rewrite tests as non-kernel functions and regenerate
 checks, adding a GlobalISel run line
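
The checks below were produced by update_llc_test_checks.py, per the
autogenerated note at the top of the file. For reference, a regeneration
command of roughly this shape works (assuming llc was built under
build/bin; adjust paths to your tree):

llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll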
---
llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll | 172 ++++++++++++++++-------
1 file changed, 120 insertions(+), 52 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index a7ccaf79ecae6..c3c2d4cca8617 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -1,4 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefix=GI %s
define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v1v:
@@ -6,6 +8,12 @@ define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_v1v:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 1
%add = add i64 %shl, %a
ret i64 %add
@@ -17,6 +25,12 @@ define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_v4v:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 4
%add = add i64 %shl, %a
ret i64 %add
@@ -28,6 +42,13 @@ define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 5, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_v5v:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshlrev_b64 v[0:1], 5, v[0:1]
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 5
%add = add i64 %shl, %a
ret i64 %add
@@ -39,57 +60,77 @@ define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], v2, v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_vvv:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, %s
%add = add i64 %shl, %a
ret i64 %add
}
-define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
+define i64 @lshl_add_u64_s2v(i64 %v) {
; GCN-LABEL: lshl_add_u64_s2v:
; GCN: ; %bb.0:
-; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_s2v:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
%a = load i64, ptr undef
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
- store i64 %add, ptr undef
- ret void
+ ret i64 %add
}
-define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
+define i64 @lshl_add_u64_v2s(i64 %a) {
; GCN-LABEL: lshl_add_u64_v2s:
; GCN: ; %bb.0:
-; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_v2s:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GI-NEXT: s_setpc_b64 s[30:31]
%v = load i64, ptr undef
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
- store i64 %add, ptr undef
- ret void
+ ret i64 %add
}
-define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
+define i64 @lshl_add_u64_s2s(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_s2s:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_s2s:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
- store i64 %add, ptr undef
- ret void
+ ret i64 %add
}
define i64 @add_u64_vv(i64 %v, i64 %a) {
@@ -98,53 +139,72 @@ define i64 @add_u64_vv(i64 %v, i64 %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: add_u64_vv:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %v, %a
ret i64 %add
}
-define amdgpu_kernel void @add_u64_sv(i64 %v) {
+define i64 @add_u64_sv(i64 %v) {
; GCN-LABEL: add_u64_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: add_u64_sv:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
%a = load i64, ptr undef
%add = add i64 %v, %a
- store i64 %add, ptr undef
- ret void
+ ret i64 %add
}
-define amdgpu_kernel void @add_u64_vs(i64 %a) {
+define i64 @add_u64_vs(i64 %a) {
; GCN-LABEL: add_u64_vs:
; GCN: ; %bb.0:
-; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: add_u64_vs:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GI-NEXT: s_setpc_b64 s[30:31]
%v = load i64, ptr undef
%add = add i64 %v, %a
- store i64 %add, ptr undef
- ret void
+ ret i64 %add
}
-define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
+define i64 @add_u64_ss(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_ss:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_add_u32 s0, s0, s2
-; GCN-NEXT: s_addc_u32 s1, s1, s3
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: add_u64_ss:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %v, %a
- store i64 %add, ptr undef
- ret void
+ ret i64 %add
}
define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
@@ -155,6 +215,14 @@ define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_gep:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GI-NEXT: flat_load_dword v0, v[0:1]
+; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i32, ptr %p, i64 %a
%v = load i32, ptr %gep
ret i32 %v