[llvm] 2c920a1 - [AMDGPU] wmma_scale* IR verification (#155493)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 26 14:10:23 PDT 2025
Author: Stanislav Mekhanoshin
Date: 2025-08-26T14:10:19-07:00
New Revision: 2c920a11e3af22f4b999020b084ce677da71888f
URL: https://github.com/llvm/llvm-project/commit/2c920a11e3af22f4b999020b084ce677da71888f
DIFF: https://github.com/llvm/llvm-project/commit/2c920a11e3af22f4b999020b084ce677da71888f.diff
LOG: [AMDGPU] wmma_scale* IR verification (#155493)
Added:
Modified:
llvm/lib/IR/Verifier.cpp
llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
Removed:
################################################################################
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 4eb4b58d022e8..9fda08645e118 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6724,7 +6724,9 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"invalid vector type for format", &Call, Src1, Call.getArgOperand(5));
break;
}
- case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
+ case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
Value *Src0 = Call.getArgOperand(1);
Value *Src1 = Call.getArgOperand(3);
diff --git a/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
index af0d7f1169189..553c64dcd87fe 100644
--- a/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
+++ b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
@@ -24,6 +24,46 @@ bb:
ret void
}
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <16 x i64> %A
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <16 x i64> %B
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+; CHECK-NEXT: <16 x i64> %A
+define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+; CHECK-NEXT: <16 x i64> %B
+define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
; --------------------------------------------------------------------
; Impossible vector types
; --------------------------------------------------------------------
@@ -48,6 +88,26 @@ bb:
ret void
}
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v15i32.v16i32(i32 0, <15 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <15 x i32> %A
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v15i32_fp8___v16i32_fp8(<15 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v15i32.v16i32(i32 0, <15 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v15i32(i32 0, <16 x i32> %A, i32 0, <15 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <15 x i32> %B
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v15i32_fp8(<16 x i32> %A, <15 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v15i32(i32 0, <16 x i32> %A, i32 0, <15 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
; --------------------------------------------------------------------
; Out of bounds format
; --------------------------------------------------------------------
@@ -72,6 +132,26 @@ bb:
ret void
}
+; CHECK: invalid value for matrix format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 5, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: i32 5
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_invalid0___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 5, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid value for matrix format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 5, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: i32 5
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_invalid1(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 5, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
; --------------------------------------------------------------------
; Incorrect signature for format cases (IR vector too small)
; --------------------------------------------------------------------
@@ -163,3 +243,47 @@ bb:
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 2, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <8 x i32> %A
+; CHECK-NEXT: i32 2
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v8i32_fp6___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 2, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 2, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <8 x i32> %B
+; CHECK-NEXT: i32 2
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_fp6(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <16 x i32> %A, i32 2, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 3, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+; CHECK-NEXT: <8 x i32> %A
+; CHECK-NEXT: i32 3
+define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v8i32_bf6___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 3, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 3, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+; CHECK-NEXT: <8 x i32> %B
+; CHECK-NEXT: i32 3
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_bf6(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <16 x i32> %A, i32 3, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
More information about the llvm-commits
mailing list