[llvm] 1f53f20 - [AMDGPU] Support gfx940 v_lshl_add_u64 instruction
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 14 15:45:53 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-03-14T15:45:42-07:00
New Revision: 1f53f20fc1c709d2538b1566b5d16f152a794d35
URL: https://github.com/llvm/llvm-project/commit/1f53f20fc1c709d2538b1566b5d16f152a794d35
DIFF: https://github.com/llvm/llvm-project/commit/1f53f20fc1c709d2538b1566b5d16f152a794d35.diff
LOG: [AMDGPU] Support gfx940 v_lshl_add_u64 instruction
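The new VOP3 instruction computes D.u64 = (S0.u64 << S1.u[2:0]) + S2.u64. The selection pattern only folds shift amounts of 0-4, and a shift of 0 turns it into a plain 64-bit add, which is how V_ADD_U64_PSEUDO is now lowered on gfx940. As a minimal illustrative sketch mirroring the added lshl-add-u64.ll test (the function name lshl_add_example and the expected register assignment are illustrative, not taken from the commit), IR like this should select to a single v_lshl_add_u64:

    define i64 @lshl_add_example(i64 %v, i64 %a) {
      %shl = shl i64 %v, 3    ; shift amount within the 0-4 range the pattern accepts
      %add = add i64 %shl, %a ; shl + add fold into one v_lshl_add_u64
      ret i64 %add
    }

    ; expected gfx940 selection (schematic):
    ;   v_lshl_add_u64 v[0:1], v[0:1], 3, v[2:3]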
Differential Revision: https://reviews.llvm.org/D121401
Added:
llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
Modified:
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
llvm/test/MC/AMDGPU/gfx940_asm_features.s
llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index df9b2c8b6e9f3..1029822573bbc 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -894,6 +894,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasMovB64() const { return GFX940Insts; }
+ bool hasLshlAddB64() const { return GFX940Insts; }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bfd641fdd9e74..12a5aedb84b63 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4071,6 +4071,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+
+ if (IsAdd && ST.hasLshlAddB64()) {
+ auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
+ Dest.getReg())
+ .add(Src0)
+ .addImm(0)
+ .add(Src1);
+ TII->legalizeOperands(*Add);
+ MI.eraseFromParent();
+ return BB;
+ }
+
const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -4079,10 +4094,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
Register CarryReg = MRI.createVirtualRegister(CarryRC);
Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
- MachineOperand &Dest = MI.getOperand(0);
- MachineOperand &Src0 = MI.getOperand(1);
- MachineOperand &Src1 = MI.getOperand(2);
-
const TargetRegisterClass *Src0RC = Src0.isReg()
? MRI.getRegClass(Src0.getReg())
: &AMDGPU::VReg_64RegClass;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 494e3aeb6d556..024be304a1908 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -609,6 +609,23 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
}];
}
+def shl_0_to_4 : PatFrag<
+ (ops node:$src0, node:$src1), (shl node:$src0, node:$src1),
+ [{
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ return C->getZExtValue() <= 4;
+ }
+ return false;
+ }]> {
+ let GISelPredicateCode = [{
+ int64_t Imm = 0;
+ if (!mi_match(MI.getOperand(2).getReg(), MRI, m_ICst(Imm)) &&
+ !mi_match(MI.getOperand(2).getReg(), MRI, m_Copy(m_ICst(Imm))))
+ return false;
+ return (uint64_t)Imm <= 4;
+ }];
+}
+
let SubtargetPredicate = isGFX9Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -649,6 +666,10 @@ defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I
defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
} // End isReMaterializable = 1
+// V_LSHL_ADD_U64: D0.u64 = (S0.u64 << S1.u[2:0]) + S2.u64
+// src0 is shifted left by 0-4 (use "0" to get ADD_U64).
+let SubtargetPredicate = isGFX940Plus in
+defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>;
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
// This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
@@ -664,6 +685,12 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
+let SubtargetPredicate = isGFX940Plus in
+def : GCNPat<
+ (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
+>;
+
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
@@ -1273,3 +1300,5 @@ defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx9 <0x1f2>;
defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>;
defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>;
+
+defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
new file mode 100644
index 0000000000000..61a4f8fb32cdd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -0,0 +1,108 @@
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
+; GCN-LABEL: lshl_add_u64_v1v:
+; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1, v[{{[0-9:]+}}]
+ %shl = shl i64 %v, 1
+ %add = add i64 %shl, %a
+ ret i64 %add
+}
+
+define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
+; GCN-LABEL: lshl_add_u64_v4v:
+; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4, v[{{[0-9:]+}}]
+ %shl = shl i64 %v, 4
+ %add = add i64 %shl, %a
+ ret i64 %add
+}
+
+define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
+; GCN-LABEL: lshl_add_u64_v5v:
+; GCN: v_lshlrev_b64
+; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+ %shl = shl i64 %v, 5
+ %add = add i64 %shl, %a
+ ret i64 %add
+}
+
+define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
+; GCN-LABEL: lshl_add_u64_vvv:
+; GCN: v_lshlrev_b64
+; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+ %shl = shl i64 %v, %s
+ %add = add i64 %shl, %a
+ ret i64 %add
+}
+
+define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
+; GCN-LABEL: lshl_add_u64_s2v:
+; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 2, v[{{[0-9:]+}}]
+ %a = load i64, i64* undef
+ %shl = shl i64 %v, 2
+ %add = add i64 %shl, %a
+ store i64 %add, i64* undef
+ ret void
+}
+
+define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
+; GCN-LABEL: lshl_add_u64_v2s:
+; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 2, s[{{[0-9:]+}}]
+ %v = load i64, i64* undef
+ %shl = shl i64 %v, 2
+ %add = add i64 %shl, %a
+ store i64 %add, i64* undef
+ ret void
+}
+
+define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
+; GCN-LABEL: lshl_add_u64_s2s:
+; GCN: s_lshl_b64
+; GCN: s_add_u32
+; GCN: s_addc_u32
+ %shl = shl i64 %v, 2
+ %add = add i64 %shl, %a
+ store i64 %add, i64* undef
+ ret void
+}
+
+define i64 @add_u64_vv(i64 %v, i64 %a) {
+; GCN-LABEL: add_u64_vv:
+; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+ %add = add i64 %v, %a
+ ret i64 %add
+}
+
+define amdgpu_kernel void @add_u64_sv(i64 %v) {
+; GCN-LABEL: add_u64_sv:
+; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+ %a = load i64, i64* undef
+ %add = add i64 %v, %a
+ store i64 %add, i64* undef
+ ret void
+}
+
+define amdgpu_kernel void @add_u64_vs(i64 %a) {
+; GCN-LABEL: add_u64_vs:
+; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+ %v = load i64, i64* undef
+ %add = add i64 %v, %a
+ store i64 %add, i64* undef
+ ret void
+}
+
+define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
+; GCN-LABEL: add_u64_ss:
+; GCN: s_add_u32
+; GCN: s_addc_u32 s1, s1, s3
+ %add = add i64 %v, %a
+ store i64 %add, i64* undef
+ ret void
+}
+
+define i32 @lshl_add_u64_gep(i32 *%p, i64 %a) {
+; GCN-LABEL: lshl_add_u64_gep:
+; GCN: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+ %gep = getelementptr inbounds i32, i32* %p, i64 %a
+ %v = load i32, i32* %gep
+ ret i32 %v
+}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 89129f7764844..57b49ddd843d1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -212,10 +212,9 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
@@ -227,10 +226,9 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX940-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
@@ -450,13 +448,12 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
-; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -465,13 +462,12 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
-; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX940-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
index 1c7c502af9f2b..c1c7c0dc32281 100644
--- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
@@ -149,6 +149,22 @@ v_mov_b64 v[2:3], 1
// GFX940: v_mov_b64_e32 v[2:3], 0x64 ; encoding: [0xff,0x70,0x04,0x7e,0x64,0x00,0x00,0x00]
v_mov_b64 v[2:3], 0x64
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_lshl_add_u64 v[2:3], s[4:5], v7, v[8:9] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x0e,0x22,0x04]
+v_lshl_add_u64 v[2:3], s[4:5], v7, v[8:9]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_lshl_add_u64 v[2:3], v[4:5], 0, 1 ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x01,0x05,0x02]
+v_lshl_add_u64 v[2:3], v[4:5], 0, 1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_lshl_add_u64 v[2:3], v[4:5], 3, s[2:3] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x07,0x09,0x00]
+v_lshl_add_u64 v[2:3], v[4:5], 3, s[2:3]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x08,0x09,0x04]
+v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3]
+
// GFX90A: error: invalid operand for instruction
// GFX10: error: instruction not supported on this GPU
// GFX940: buffer_wbl2 sc1 ; encoding: [0x00,0x80,0xa0,0xe0,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
index cf8844889f19a..0922536d90aff 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
@@ -102,6 +102,18 @@
# GFX940: v_mov_b64_e32 v[2:3], 0x64 ; encoding: [0xff,0x70,0x04,0x7e,0x64,0x00,0x00,0x00]
0xff,0x70,0x04,0x7e,0x64,0x00,0x00,0x00
+# GFX940: v_lshl_add_u64 v[2:3], s[4:5], v7, v[8:9] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x0e,0x22,0x04]
+0x02,0x00,0x08,0xd2,0x04,0x0e,0x22,0x04
+
+# GFX940: v_lshl_add_u64 v[2:3], v[4:5], 0, 1 ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x01,0x05,0x02]
+0x02,0x00,0x08,0xd2,0x04,0x01,0x05,0x02
+
+# GFX940: v_lshl_add_u64 v[2:3], v[4:5], 3, s[2:3] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x07,0x09,0x00]
+0x02,0x00,0x08,0xd2,0x04,0x07,0x09,0x00
+
+# GFX940: v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x08,0x09,0x04]
+0x02,0x00,0x08,0xd2,0x04,0x08,0x09,0x04
+
# GFX940: buffer_wbl2 sc1 ; encoding: [0x00,0x80,0xa0,0xe0,0x00,0x00,0x00,0x00]
0x00,0x80,0xa0,0xe0,0x00,0x00,0x00,0x00