[llvm-branch-commits] [clang] [llvm] AMDGPU: Add support for v_ashr_pk_i8/u8_i32 instructions for gfx950 (PR #117596)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Nov 25 13:56:39 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117596
>From 2e0074c4bb6f973aeb201ef52772800b2bb1a810 Mon Sep 17 00:00:00 2001
From: Sirish Pande <Sirish.Pande at amd.com>
Date: Tue, 13 Feb 2024 10:54:51 -0600
Subject: [PATCH] AMDGPU: Add support for v_ashr_pk_i8/u8_i32 instructions for
gfx950
This patch adds assembly and builtin support for v_ashr_pk_i8/u8_i32
instructions.
Co-authored-by: Sirish Pande <Sirish.Pande at amd.com>
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 +
clang/test/CodeGenOpenCL/amdgpu-features.cl | 2 +-
.../CodeGenOpenCL/builtins-amdgcn-gfx950.cl | 46 ++++++++++++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 +++
llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +++
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 +
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 +++
llvm/lib/Target/AMDGPU/VOPInstructions.td | 1 +
llvm/lib/TargetParser/TargetParser.cpp | 1 +
llvm/test/MC/AMDGPU/gfx950_asm_vop3.s | 72 +++++++++++++++++++
.../Disassembler/AMDGPU/gfx950_dasm_vop3.txt | 36 ++++++++++
13 files changed, 194 insertions(+), 1 deletion(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index dacbf5aa902f60..fd449697e91216 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -467,6 +467,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 56013dad9b6651..db7fd76ec91189 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 6f3c81b26be0b8..34eae17827ff7c 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -169,3 +169,49 @@ void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, f
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale);
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale);
}
+
+// CHECK-LABEL: @test_ashr_pk_i8_i32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) {
+ *out = __builtin_amdgcn_ashr_pk_i8_i32(src0, src1, src2);
+}
+
+// CHECK-LABEL: @test_ashr_pk_u8_i32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.u8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
+ *out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 72298dc298a6fc..b8e07cc799b42e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3199,6 +3199,16 @@ def int_amdgcn_permlane32_swap :
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>;
+// llvm.amdgcn.ashr_pk_i8_i32 int vdst, int src0, int src1 int src2
+def int_amdgcn_ashr_pk_i8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_i8_i32">,
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
+// llvm.amdgcn.ashr_pk_u8_i32 int vdst, int src0, int src1 int src2
+def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a47206f6bf6730..e4e427ef43339b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -414,12 +414,19 @@ def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp
"Has f16bf16 to fp6bf6 conversion scale instructions"
>;
+def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
+ "HasAshrPkInsts",
+ "true",
+ "Has Arithmetic Shift Pack instructions"
+>;
+
def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
"GFX950Insts",
"true",
"Additional instructions for GFX950+",
[FeaturePermlane16Swap,
FeaturePermlane32Swap,
+ FeatureAshrPkInsts,
FeatureFP8ConversionScaleInsts,
FeatureBF8ConversionScaleInsts,
FeatureFP4ConversionScaleInsts,
@@ -2474,6 +2481,9 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
AssemblerPredicate<(all_of FeatureXF32Insts)>;
+def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
+ AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
+
// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2fe9e3dd9b18ed..7651c84eb0ae65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4546,6 +4546,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
+ case Intrinsic::amdgcn_ashr_pk_i8_i32:
+ case Intrinsic::amdgcn_ashr_pk_u8_i32:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index c9cbc546720759..390849dd2e0564 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -245,8 +245,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasForceStoreSC0SC1 = false;
bool HasRequiredExportPriority = false;
bool HasVmemWriteVgprInOrder = false;
+ bool HasAshrPkInsts = false;
bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;
+
bool RequiresCOV6 = false;
// Dummy feature to use for assembler in tablegen.
@@ -1326,6 +1328,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasPermlane16Swap() const { return HasPermlane16Swap; }
bool hasPermlane32Swap() const { return HasPermlane32Swap; }
+ bool hasAshrPkInsts() const { return HasAshrPkInsts; }
bool hasMinimum3Maximum3F32() const {
return HasMinimum3Maximum3F32;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 10af6480f6dfb6..f9cb6bb8d297a9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2856,6 +2856,7 @@ def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
+def VOP_I16_I32_I32_I32 : VOPProfile <[i16, i32, i32, i32]>;
def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 66e3b355418885..11700a4c34f9f3 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1189,6 +1189,11 @@ let SubtargetPredicate = HasPseudoScalarTrans in {
def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
}
+let SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 in {
+ defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_i8_i32>;
+ defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_u8_i32>;
+} // End SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1
+
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
@@ -1978,5 +1983,8 @@ defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_b
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
}
+defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>;
+defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>;
+
defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3_Real_gfx9<0x252, "v_cvt_scalef32_2xpk16_fp6_f32">;
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3_Real_gfx9<0x253, "v_cvt_scalef32_2xpk16_bf6_f32">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 34c7989b9d0b86..0e19696a32f86f 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1455,6 +1455,7 @@ def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
+def VOP3_OPSEL_ONLY : VOP3Features<0, 1, 0, 0>;
// Packed is misleading, but it enables the appropriate op_sel
// modifiers.
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 6da48da608cc14..23532b9214a892 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -474,6 +474,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["prng-inst"] = true;
Features["permlane16-swap"] = true;
Features["permlane32-swap"] = true;
+ Features["ashr-pk-insts"] = true;
Features["gfx950-insts"] = true;
[[fallthrough]];
case GK_GFX942:
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s
index da5566c1049649..5f5e5057117059 100644
--- a/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s
@@ -74,3 +74,75 @@ v_bitop3_b16 v5, v1, v2, s3 bitop3:161
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_bitop3_b16 v5, v1, v2, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x33,0xd2,0x01,0x05,0x0e,0x30]
// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_i8_i32 v2, s4, v7, v8
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_i8_i32 v2, v4, 0, 1
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_i8_i32 v2, v4, 3, s2
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_i8_i32 v2, s4, 4, v2
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_i8_i32 v2, v4, v7, 0.5
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1]
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_u8_i32 v2, s4, v7, v8
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_u8_i32 v2, v4, 0, 1
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_u8_i32 v2, v4, 3, s2
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_u8_i32 v2, s4, 4, v2
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_u8_i32 v2, v4, v7, -2.0
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03]
+// GFX12-ERR: error: instruction not supported on this GPU
+
+v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1]
+// GFX906-ERR: error: instruction not supported on this GPU
+// GFX940-ERR: error: instruction not supported on this GPU
+// GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
+// GFX12-ERR: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
index 5ae970005a3b86..ccc55352413777 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
@@ -744,6 +744,42 @@
# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20
+# GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04]
+0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04
+
+# GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04]
+0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04
+
+# GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02]
+0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02
+
+# GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00]
+0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00
+
+# GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03]
+0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03
+
+# GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04]
+0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04
+
+# GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04]
+0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04
+
+# GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04]
+0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04
+
+# GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02]
+0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02
+
+# GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00]
+0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00
+
+# GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03]
+0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03
+
+# GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
+0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04
+
# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04
More information about the llvm-branch-commits
mailing list