[llvm] AMDGPU: Allow only VGPR wide sources in fp6/4/8 conversions (PR #127464)
Pravin Jagtap via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 17 02:28:29 PST 2025
https://github.com/pravinjagtap updated https://github.com/llvm/llvm-project/pull/127464
>From 824b47b29580fe5d8be3d93c50c6c31deabc9e34 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Mon, 17 Feb 2025 15:15:17 +0530
Subject: [PATCH 1/2] AMDGPU: Allow only VGPR wide sources in fp6/4/8
conversions
All convert opcodes operating on FP6/BF6/FP4
data must use VGPR sources for any operand slots
providing more than 32-bits of data.
Change-Id: Ic48ef739875c341e85f1a6c3984851ae497592af
---
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 13 +++++
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 12 ++++-
llvm/test/MC/AMDGPU/gfx950_err.s | 63 ++++++++++++++++++++++
3 files changed, 86 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bb78e77a9dc1a..e2d0b28a391fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1818,6 +1818,18 @@ class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
1 : RegisterOperand<VGPR_32>);
}
+// VGPR only VOP3 src with 9 bit encoding
+class getVOP3VSrcReg9ForVT<ValueType VT> {
+ RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
+ !eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 256) : VRegSrc_256,
+ !eq(VT.Size, 192) : VRegSrc_192,
+ !eq(VT.Size, 128) : VRegSrc_128,
+ !eq(VT.Size, 96) : VRegSrc_96,
+ !eq(VT.Size, 64) : VRegSrc_64,
+ 1 : VRegSrc_32);
+}
+
// Src2 of VOP3 DPP instructions cannot be a literal
class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
RegisterOperand ret =
@@ -2852,6 +2864,7 @@ def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>;
def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
+def VOP_I32_V2F32_I32_F32 : VOPProfile<[i32, v2f32, i32, f32]>;
def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index afafc2ecccfaf..6a6d8bd94826f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1052,7 +1052,10 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasFP4DstByteSel = 1;
}
-def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
+class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
+
+ let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+ getVOP3SrcForVT<P.Src0VT>.ret);
let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
Int32InputMods: $src1_modifiers, Src1RC64:$src1,
FP32InputMods: $src2_modifiers, Src2RC64:$src2,
@@ -1100,6 +1103,11 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
let HasExt32BitDPP = 0;
let HasExtVOP3DPP = 0;
let HasExt64BitDPP = 0;
+
+ // All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
+ // any operand slots > 32 bit.
+ let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+ getVOP3SrcForVT<P.Src0VT>.ret);
}
let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
@@ -1141,7 +1149,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
- defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+ defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOP_I32_V2F32_I32_F32>>;
}
}
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index e0b832d8fe297..099916f48b5e7 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -434,3 +434,66 @@ v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 clamp
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_fp6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_bf6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v4, v5
+v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3], v4, v5
>From 13e7dde79eabc0f1502a29cb0730505fc401d514 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Mon, 17 Feb 2025 15:57:56 +0530
Subject: [PATCH 2/2] clang-format
---
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 16 ++++++++--------
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 12 ++++++++----
2 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e2d0b28a391fb..576f6a2547fb8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1820,14 +1820,14 @@ class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
// VGPR only VOP3 src with 9 bit encoding
class getVOP3VSrcReg9ForVT<ValueType VT> {
- RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
- !eq(VT.Size, 512) : VRegSrc_512,
- !eq(VT.Size, 256) : VRegSrc_256,
- !eq(VT.Size, 192) : VRegSrc_192,
- !eq(VT.Size, 128) : VRegSrc_128,
- !eq(VT.Size, 96) : VRegSrc_96,
- !eq(VT.Size, 64) : VRegSrc_64,
- 1 : VRegSrc_32);
+ RegisterOperand ret = !cond(!eq(VT.Size, 1024): VRegSrc_1024,
+ !eq(VT.Size, 512): VRegSrc_512,
+ !eq(VT.Size, 256): VRegSrc_256,
+ !eq(VT.Size, 192): VRegSrc_192,
+ !eq(VT.Size, 128): VRegSrc_128,
+ !eq(VT.Size, 96): VRegSrc_96,
+ !eq(VT.Size, 64): VRegSrc_64,
+ 1: VRegSrc_32);
}
// Src2 of VOP3 DPP instructions cannot be a literal
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 6a6d8bd94826f..93bce75f72deb 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1052,10 +1052,11 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasFP4DstByteSel = 1;
}
-class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
+class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P>
+ : VOP3_Profile<P, VOP3_OPSEL> {
let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
- getVOP3SrcForVT<P.Src0VT>.ret);
+ getVOP3SrcForVT<P.Src0VT>.ret);
let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
Int32InputMods: $src1_modifiers, Src1RC64:$src1,
FP32InputMods: $src2_modifiers, Src2RC64:$src2,
@@ -1107,7 +1108,7 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
// All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
// any operand slots > 32 bit.
let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
- getVOP3SrcForVT<P.Src0VT>.ret);
+ getVOP3SrcForVT<P.Src0VT>.ret);
}
let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
@@ -1149,7 +1150,10 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
- defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOP_I32_V2F32_I32_F32>>;
+ defm V_CVT_SCALEF32_SR_PK_FP4_F32
+ : VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32",
+ VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<
+ VOP_I32_V2F32_I32_F32>>;
}
}
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
More information about the llvm-commits
mailing list