[llvm] AMDGPU: MC support for v_cvt_scalef32_pk32_f32_[fp|bf]6 of gfx950 (PR #117590)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 25 19:18:34 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117590
From e147cd5e92555264214ce2e8c982b47930a5c09f Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Mon, 8 Apr 2024 01:10:37 -0400
Subject: [PATCH] AMDGPU: MC support for v_cvt_scalef32_pk32_f32_[fp|bf]6 of
gfx950
Co-authored-by: Pravin Jagtap <Pravin.Jagtap at amd.com>
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 14 +++++++++--
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +++
.../Disassembler/AMDGPUDisassembler.cpp | 1 +
.../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 10 ++++++--
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 1 +
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 22 +++++++++++++++++
llvm/test/MC/AMDGPU/gfx950_asm_features.s | 10 +++++++-
llvm/test/MC/AMDGPU/gfx950_err.s | 24 +++++++++++++++++++
.../Disassembler/AMDGPU/gfx950_dasm_vop3.txt | 6 +++++
10 files changed, 87 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 64e88cf03b429b..15a1bb799804d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -402,11 +402,17 @@ def FeatureFP4ConversionScaleInsts : SubtargetFeature<"fp4-cvt-scale-insts",
"Has fp4 conversion scale instructions"
>;
+def FeatureFP6BF6ConversionScaleInsts : SubtargetFeature<"fp6bf6-cvt-scale-insts",
+ "HasFP6BF6ConversionScaleInsts",
+ "true",
+ "Has fp6 and bf6 conversion scale instructions"
+>;
+
def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
"GFX950Insts",
"true",
"Additional instructions for GFX950+",
- [FeaturePermlane16Swap, FeaturePermlane32Swap, FeatureFP8ConversionScaleInsts, FeatureBF8ConversionScaleInsts, FeatureFP4ConversionScaleInsts]
+ [FeaturePermlane16Swap, FeaturePermlane32Swap, FeatureFP8ConversionScaleInsts, FeatureBF8ConversionScaleInsts, FeatureFP4ConversionScaleInsts, FeatureFP6BF6ConversionScaleInsts]
>;
def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
@@ -1552,7 +1558,8 @@ def FeatureISAVersion9_5_Common : FeatureSet<
FeatureBitOp3Insts,
FeatureFP8ConversionScaleInsts,
FeatureBF8ConversionScaleInsts,
- FeatureFP4ConversionScaleInsts
+ FeatureFP4ConversionScaleInsts,
+ FeatureFP6BF6ConversionScaleInsts
])>;
def FeatureISAVersion9_4_0 : FeatureSet<
@@ -2435,6 +2442,9 @@ def HasBF8ConversionScaleInsts : Predicate<"Subtarget->hasBF8ConversionScaleInst
def HasFP4ConversionScaleInsts : Predicate<"Subtarget->hasFP4ConversionScaleInsts()">,
AssemblerPredicate<(all_of FeatureFP4ConversionScaleInsts)>;
+def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionScaleInsts()">,
+ AssemblerPredicate<(all_of FeatureFP6BF6ConversionScaleInsts)>;
+
def HasGDS : Predicate<"Subtarget->hasGDS()">;
def HasGWS : Predicate<"Subtarget->hasGWS()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 20f573da0ec82b..1a09f55dfdb28a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -53,6 +53,7 @@ class AMDGPUSubtarget {
bool HasFP8ConversionScaleInsts = false;
bool HasBF8ConversionScaleInsts = false;
bool HasFP4ConversionScaleInsts = false;
+ bool HasFP6BF6ConversionScaleInsts = false;
bool EnableRealTrue16Insts = false;
bool HasBF16ConversionInsts = false;
bool HasMadMixInsts = false;
@@ -184,6 +185,8 @@ class AMDGPUSubtarget {
bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; }
+ bool hasFP6BF6ConversionScaleInsts() const { return HasFP6BF6ConversionScaleInsts; }
+
bool hasMadMacF32Insts() const {
return HasMadMacF32Insts || !isGCN();
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 136fe2e3f90d02..fa5f86b0788cc2 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -1530,6 +1530,7 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
case OPWV232: return VReg_64RegClassID;
case OPW96: return VReg_96RegClassID;
case OPW128: return VReg_128RegClassID;
+ case OPW192: return VReg_192RegClassID;
case OPW160: return VReg_160RegClassID;
case OPW256: return VReg_256RegClassID;
case OPW288: return VReg_288RegClassID;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 3e20a2ab9e66ca..b19e4b74a394cb 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -219,6 +219,7 @@ class AMDGPUDisassembler : public MCDisassembler {
OPW96,
OPW128,
OPW160,
+ OPW192,
OPW256,
OPW288,
OPW320,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index acb703dba6a980..f20d6526e20b2c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1696,7 +1696,8 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
defvar op16 = !if(IsTrue16, !if (IsVOP3Encoding, VOPDstOperand_t16,
VOPDstOperand_t16Lo128),
VOPDstOperand<VGPR_32>);
- RegisterOperand ret = !cond(!eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
+ RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>,
+ !eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
!eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
!eq(VT.Size, 64) : VOPDstOperand<VReg_64>,
!eq(VT.Size, 32) : VOPDstOperand<VGPR_32>,
@@ -1752,7 +1753,8 @@ class getSOPSrcForVT<ValueType VT> {
// Returns the vreg register class to use for source operand given VT
class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> {
RegisterOperand ret =
- !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
+ !cond(!eq(VT.Size, 192) : RegisterOperand<VReg_192>,
+ !eq(VT.Size, 128) : RegisterOperand<VReg_128>,
!eq(VT.Size, 96) : RegisterOperand<VReg_96>,
!eq(VT.Size, 64) : RegisterOperand<VReg_64>,
!eq(VT.Size, 48) : RegisterOperand<VReg_64>,
@@ -1785,6 +1787,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
!eq(VT, v2i16) : VSrc_v2b16,
!eq(VT, v4f16) : AVSrc_64,
!eq(VT, v4bf16) : AVSrc_64,
+ !eq(VT.Size, 192) : VRegSrc_192,
!eq(VT.Size, 128) : VRegSrc_128,
!eq(VT.Size, 96) : VRegSrc_96,
!eq(VT.Size, 64) : VSrc_b64,
@@ -2828,6 +2831,9 @@ def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
+def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
+def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
+def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index e3baeed01841ab..11ca4df6e9f445 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1249,6 +1249,7 @@ def VRegSrc_32 : SrcReg9<VGPR_32, "OPW32">;
def VRegSrc_64 : SrcReg9<VReg_64, "OPW64">;
def VRegSrc_96 : SrcReg9<VReg_96, "OPW96">;
def VRegSrc_128: SrcReg9<VReg_128, "OPW128">;
+def VRegSrc_192: SrcReg9<VReg_192, "OPW192">;
def VRegSrc_256: SrcReg9<VReg_256, "OPW256">;
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32, "OPW32">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index cf00910210e0bd..1009f2d9593609 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -925,6 +925,19 @@ def VOP3_CVT_SCALE_PK_FP8BF8_F16BF16_Profile : VOP3_Profile<VOPProfile<[i32, v2f
let HasOMod = 0;
}
+class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
+ let HasModifiers = 0;
+ let HasSrc0IntMods = 0;
+ let HasSrc1IntMods = 0;
+ let HasOMod = 0;
+ let HasOpSel = 0;
+ let HasClamp = 0;
+ let HasExtDPP = 0;
+ let HasExt32BitDPP = 0;
+ let HasExtVOP3DPP = 0;
+ let HasExt64BitDPP = 0;
+}
+
let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
defm V_CVT_SCALEF32_F16_FP8 : VOP3Inst<"v_cvt_scalef32_f16_fp8", VOP3_CVT_SCALE_F1632_FP8BF8_Profile<f16>>;
defm V_CVT_SCALEF32_F32_FP8 : VOP3Inst<"v_cvt_scalef32_f32_fp8", VOP3_CVT_SCALE_F1632_FP8BF8_Profile<f32>>;
@@ -950,6 +963,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>;
}
+let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
+ defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>>;
+ defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>>;
+}
+
let SubtargetPredicate = isGFX10Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -1894,3 +1912,7 @@ defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3OpSel_Real_gfx9 <0x23d>;
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3OpSel_Real_gfx9 <0x250>;
defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3OpSel_Real_gfx9 <0x251>;
}
+let OtherPredicates = [HasFP6BF6ConversionScaleInsts] in {
+defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3_Real_gfx9<0x256, "v_cvt_scalef32_pk32_f32_fp6">;
+defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3_Real_gfx9<0x257, "v_cvt_scalef32_pk32_f32_bf6">;
+}
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
index 85cd02aa714175..95d31d2293075f 100644
--- a/llvm/test/MC/AMDGPU/gfx950_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
@@ -884,4 +884,12 @@ v_cvt_scalef32_pk_bf16_fp4 v1, v2, s3 op_sel:[1,1,0]
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_scalef32_pk_bf16_fp4 v1, s2, 3 op_sel:[1,1,0] ; encoding: [0x01,0x18,0x51,0xd2,0x02,0x06,0x01,0x00]
-v_cvt_scalef32_pk_bf16_fp4 v1, s2, 3 op_sel:[1,1,0]
\ No newline at end of file
+v_cvt_scalef32_pk_bf16_fp4 v1, s2, 3 op_sel:[1,1,0]
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 ; encoding: [0x02,0x00,0x56,0xd2,0x02,0x0d,0x02,0x00]
+v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 ; encoding: [0x02,0x00,0x57,0xd2,0x02,0x0d,0x02,0x00]
+v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6
\ No newline at end of file
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index 89167ae35e2967..6eebd4f7ccd76b 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -125,3 +125,27 @@ v_cvt_scalef32_pk_bf16_fp4 v1, v2, v3 div:2
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
v_cvt_scalef32_pk_bf16_fp4 v1, v2, v3 clamp div:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 mul:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 div:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 clamp div:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 mul:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 div:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 clamp div:2
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
index 80b5835fab1084..73fd3edfbad486 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
@@ -611,3 +611,9 @@
# GFX950: v_cvt_scalef32_pk_bf16_fp4 v1, s2, 3 op_sel:[1,1,0] ; encoding: [0x01,0x18,0x51,0xd2,0x02,0x06,0x01,0x00]
0x01,0x18,0x51,0xd2,0x02,0x06,0x01,0x00
+
+# GFX950: v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 ; encoding: [0x02,0x00,0x56,0xd2,0x02,0x0d,0x02,0x00]
+0x02,0x00,0x56,0xd2,0x02,0x0d,0x02,0x00
+
+# GFX950: v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 ; encoding: [0x02,0x00,0x57,0xd2,0x02,0x0d,0x02,0x00]
+0x02,0x00,0x57,0xd2,0x02,0x0d,0x02,0x00
More information about the llvm-commits mailing list