[llvm] [AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on supported subtargets (PR #67461)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 26 10:15:51 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mc
<details>
<summary>Changes</summary>
In order to avoid duplicating every dpp pseudo opcode that has src1, we allow it for all opcodes and add manual checks on subtargets that do not support it.
---
Full diff: https://github.com/llvm/llvm-project/pull/67461.diff
11 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+9-1)
- (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+26-9)
- (modified) llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp (+9-1)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+5)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1-1)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+1)
- (modified) llvm/test/CodeGen/AMDGPU/dpp_combine.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir (+47-3)
- (added) llvm/test/MC/AMDGPU/gfx1150_asm_features.s (+20)
- (modified) llvm/test/MC/AMDGPU/gfx11_asm_err.s (+3)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d5356d1be3d758a..924144c5366f37e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -472,6 +472,12 @@ def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit",
"Support DPP (Data Parallel Primitives) extension in DP ALU"
>;
+def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr",
+ "HasDPPSrc1SGPR",
+ "true",
+ "Support SGPR for Src1 of DPP instructions"
+>;
+
def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
"HasPackedFP32Ops",
"true",
@@ -1383,11 +1389,13 @@ def FeatureISAVersion11_0_3 : FeatureSet<
def FeatureISAVersion11_5_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
- [FeatureSALUFloatInsts])>;
+ [FeatureSALUFloatInsts,
+ FeatureDPPSrc1SGPR])>;
def FeatureISAVersion11_5_1 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureSALUFloatInsts,
+ FeatureDPPSrc1SGPR,
FeatureGFX11FullVGPRs])>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5032dd665b07e81..35656bcaea1af7f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4231,16 +4231,33 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
const OperandVector &Operands) {
const unsigned Opc = Inst.getOpcode();
int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl);
- if (DppCtrlIdx < 0)
- return true;
- unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
+ if (DppCtrlIdx >= 0) {
+ unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
+
+ if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
+ AMDGPU::isDPALU_DPP(MII.get(Opc))) {
+ // DP ALU DPP is supported for row_newbcast only on GFX9*
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
+ Error(S, "DP ALU dpp only supports row_newbcast");
+ return false;
+ }
+ }
- if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
- AMDGPU::isDPALU_DPP(MII.get(Opc))) {
- // DP ALU DPP is supported for row_newbcast only on GFX9*
- SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
- Error(S, "DP ALU dpp only supports row_newbcast");
- return false;
+ int Dpp8Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp8);
+ bool IsDPP = DppCtrlIdx >= 0 || Dpp8Idx >= 0;
+
+ if (IsDPP && !hasDPPSrc1SGPR(getSTI())) {
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx >= 0) {
+ const MCOperand &Src1 = Inst.getOperand(Src1Idx);
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (Src1.isImm() ||
+ (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
+ Error(Op.getStartLoc(), "invalid operand for instruction");
+ return false;
+ }
+ }
}
return true;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index e1cd0f0a732b93e..3e516a6e40b372a 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -278,6 +278,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(Src0);
+ int Src0Idx = NumOperands;
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
Fail = true;
@@ -301,7 +302,14 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
if (Src1) {
- if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
+ int OpNum = NumOperands;
+ // If subtarget does not support SGPRs for src1 operand then the
+ // requirements are the same as for src0. We check src0 instead because
+ // pseudos are shared between subtargets and allow SGPR for src1 on all.
+ if (!ST->hasDPPSrc1SGPR())
+ OpNum = Src0Idx;
+
+ if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
Fail = true;
break;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f60938d830fcc43..c73cf6b2bd815ff 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -128,6 +128,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasDPP = false;
bool HasDPP8 = false;
bool HasDPALU_DPP = false;
+ bool HasDPPSrc1SGPR = false;
bool HasPackedFP32Ops = false;
bool HasImageInsts = false;
bool HasExtendedImageInsts = false;
@@ -916,6 +917,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasDPALU_DPP;
}
+ bool hasDPPSrc1SGPR() const {
+ return HasDPPSrc1SGPR;
+ }
+
bool hasPackedFP32Ops() const {
return HasPackedFP32Ops;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index ea6df380153bb86..60a6964c754ff64 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2296,7 +2296,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
- field RegisterOperand Src1VOP3DPP = VRegSrc_32;
+ field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index da664c93d188963..6d0ad763d9e6cc1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2085,6 +2085,10 @@ bool hasVOPD(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureVOPD);
}
+bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) {
+ return STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR);
+}
+
unsigned hasKernargPreload(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureKernargPreload);
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d5092333c217127..297a69f54d63721 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1169,6 +1169,7 @@ bool isGFX940(const MCSubtargetInfo &STI);
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
bool hasMAIInsts(const MCSubtargetInfo &STI);
bool hasVOPD(const MCSubtargetInfo &STI);
+bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
unsigned hasKernargPreload(const MCSubtargetInfo &STI);
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
index 4d979e2cf74962c..a72bcbb529e7d9f 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; GCN-LABEL: {{^}}dpp_add:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 91e97a36a597a08..59649263f19a577 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -1,4 +1,5 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
---
@@ -6,7 +7,8 @@
# GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
# GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
-# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
+# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
name: vop3
tracksRegLiveness: true
body: |
@@ -28,11 +30,53 @@ body: |
%9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
- ; should not be combined because src1 imm is illegal
+ ; should not be combined on subtargets where src1 imm is illegal
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
...
+# GCN-label: name: vop3_sgpr_src1
+# GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GFX1100: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
+name: vop3_sgpr_src1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:sgpr_32 = COPY $sgpr0
+ %3:sgpr_32 = COPY $sgpr1
+ %4:vgpr_32 = IMPLICIT_DEF
+
+ ; should be combined because src2 allows sgpr
+ %5:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %6:vgpr_32 = V_MED3_F32_e64 0, %5, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+
+ ; should be combined only on subtargets that allow sgpr for src1
+ %7:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+
+ ; should be combined only on subtargets that allow sgpr for src1
+ %9:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
+
+ ; should be combined only on subtargets that allow inlinable constants for src1
+ %11:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
+
+ ; should not be combined when literal constants are used
+ %13:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
+...
+
# Regression test for src_modifiers on base u16 opcode
# GCN-label: name: vop3_u16
# GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
new file mode 100644
index 000000000000000..ab7dda8485570d9
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
@@ -0,0 +1,20 @@
+
+// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1150 %s | FileCheck --check-prefix=GFX1150 %s
+// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1151 %s | FileCheck --check-prefix=GFX1150 %s
+
+//
+// Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable
+// constant.
+//
+
+v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
+
+v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index e475a1d5190777e..6a3a0ca40bfb9ba 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -45,6 +45,9 @@ v_add3_u32_e64_dpp v5, v1, v2, 49812340 dpp8:[7,6,5,4,3,2,1,0]
v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cvt_f32_i32_e64_dpp v5, s1 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
``````````
</details>
https://github.com/llvm/llvm-project/pull/67461
More information about the llvm-commits
mailing list