[clang] [llvm] [AMDGPU] gfx1250 v_wmma_scale[16]_f32_16x16x128_f8f6f4 codegen (PR #152036)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 4 14:37:56 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
---
Patch is 174.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152036.diff
14 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+2)
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+10)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl (+24)
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl (+30)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+26)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (+3-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+2)
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll (+1198)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll (+216)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll (+134)
- (modified) llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir (+216)
- (modified) llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll (+312)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index ced758c814105..91fe8a812920e 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -790,6 +790,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8, "V8fV16iV16iIsV8fIbI
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4, "V8fIiV16iIiV16iIsV8fIiIiiIiIiiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, "V8fIiV16iIiV16iIsV8fIiIiLiIiIiLiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 70f510ae52ed6..d5b929987ce3d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -892,6 +892,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
@@ -1158,6 +1160,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ArgsForMatchingMatrixTypes = {5, 1, 3};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
+ ArgsForMatchingMatrixTypes = {5, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
+ ArgsForMatchingMatrixTypes = {5, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4;
+ break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
ArgsForMatchingMatrixTypes = {3, 0, 1};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
index 86c27d48ab0d4..af8a457bec686 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
@@ -169,6 +169,30 @@ void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c);
}
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+// CHECK-GFX1250-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A:%.*]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C:%.*]], i32 1, i32 2, i32 [[SCALE_SRC0:%.*]], i32 2, i32 1, i32 [[SCALE_SRC1:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, int scale_src0, int scale_src1)
+{
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+// CHECK-GFX1250-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A:%.*]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C:%.*]], i32 1, i32 2, i64 [[SCALE_SRC0:%.*]], i32 2, i32 1, i64 [[SCALE_SRC1:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, long scale_src0, long scale_src1)
+{
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
+}
+
// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_f16(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
index 8aa7c34672783..83fdc8c8c60ba 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
@@ -121,6 +121,36 @@ void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f
*out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, 2, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}}
}
+void test_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, int mod, int scale_src0, int scale_src1, bool reuse)
+{
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(mod, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, mod, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, 1); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, int mod, long scale_src0, long scale_src1, bool reuse)
+{
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(mod, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, mod, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, 1); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4' must be a constant integer}}
+}
+
void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c, int mod)
{
*out = __builtin_amdgcn_wmma_f32_16x16x32_f16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 469bdb409aaff..bfadc6a58f7f1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3932,6 +3932,30 @@ class AMDGPUWmmaIntrinsicModsC_MatrixFMT :
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
+class AMDGPUWmmaScaleIntrinsicModsC<LLVMType scale_ty> :
+ Intrinsic<
+ [llvm_anyfloat_ty], // %D
+ [
+ llvm_i32_ty, // matrix_a_fmt
+ llvm_anyint_ty, // %A
+ llvm_i32_ty, // matrix_b_fmt
+ llvm_anyint_ty, // %B
+ llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs)
+ LLVMMatchType<0>, // %C
+ llvm_i32_ty, // matrix_a_scale
+ llvm_i32_ty, // matrix_a_scale_fmt
+ scale_ty, // matrix a scale exponential
+ llvm_i32_ty, // matrix_b_scale
+ llvm_i32_ty, // matrix_b_scale_fmt
+ scale_ty, // matrix b scale exponential
+ llvm_i1_ty, // matrix_a_reuse
+ llvm_i1_ty, // matrix_b_reuse
+ ],
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<6>>,
+ ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<9>>, ImmArg<ArgIndex<10>>, ImmArg<ArgIndex<12>>, ImmArg<ArgIndex<13>>,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]
+>;
+
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
@@ -3957,6 +3981,8 @@ def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint
def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsAB<llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUWmmaIntrinsicModsC_MatrixFMT;
+def int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i32_ty>;
+def int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i64_ty>;
def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index f2207ff4cb1c4..4fe5d00679436 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1694,7 +1694,9 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
NewII->takeName(&II);
return IC.replaceInstUsesWith(II, NewII);
}
- case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
Value *Src0 = II.getArgOperand(1);
Value *Src1 = II.getArgOperand(3);
unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d11e5a3c4e3cf..74230a543ef16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4798,6 +4798,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 5d186c337a99c..bfd88c27657e5 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -2009,6 +2009,8 @@ let SubtargetPredicate = isGFX125xOnly in {
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
+ defm : WMMAPat<"V_WMMA_SCALE_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_SCALE_" # I # "_w32")>;
+ defm : WMMAPat<"V_WMMA_SCALE16_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_SCALE16_" # I # "_w32")>;
}
def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 30b74cb4804d5..099fd2fbc7ca1 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -310,6 +310,22 @@ bb:
ret void
}
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 1, i32 0, i32 1, i32 1, i32 0, i32 1, i1 false, i1 false)
+define amdgpu_ps void @wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 1, i32 0, i32 1, i32 1, i32 0, i32 1, i1 false, i1 false)
+ store <8 x float> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 1, i32 0, i64 1, i32 1, i32 0, i64 1, i1 false, i1 false)
+define amdgpu_ps void @wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 1, i32 0, i64 1, i32 1, i32 0, i64 1, i1 false, i1 false)
+ store <8 x float> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
; CHRCK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 false, <16 x half> %A, i1 false, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
define amdgpu_ps void @swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
@@ -880,6 +896,8 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>,
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32b...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/152036
More information about the llvm-commits
mailing list