[clang] 4a78225 - [AMDGPU] Add WMMA clang builtins
Piotr Sobczak via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 30 23:56:06 PDT 2022
Author: Piotr Sobczak
Date: 2022-07-01T08:55:25+02:00
New Revision: 4a782252127761b60d33e74f9d9acb0aad6f742f
URL: https://github.com/llvm/llvm-project/commit/4a782252127761b60d33e74f9d9acb0aad6f742f
DIFF: https://github.com/llvm/llvm-project/commit/4a782252127761b60d33e74f9d9acb0aad6f742f.diff
LOG: [AMDGPU] Add WMMA clang builtins
Add WMMA clang builtins and tests. Extra changes in CGBuiltin.cpp are
needed to select the correct overload of each underlying intrinsic.
WavefrontSize 32:
__builtin_amdgcn_wmma_f32_16x16x16_f16_w32
__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32
__builtin_amdgcn_wmma_f16_16x16x16_f16_w32
__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32
__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32
__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32
WavefrontSize 64:
__builtin_amdgcn_wmma_f32_16x16x16_f16_w64
__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64
__builtin_amdgcn_wmma_f16_16x16x16_f16_w64
__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64
__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64
__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D128952
Added:
clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/CodeGen/CGBuiltin.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index bd188c7f34371..68bcf546d177c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -252,6 +252,30 @@ TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV4fV4hV4hV4Ui
TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_l, "V4UiWUifV4fV4fV4fV4Ui", "nc", "gfx10-insts")
TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4hV4hV4Ui", "nc", "gfx10-insts")
+
+//===----------------------------------------------------------------------===//
+// GFX11+ only builtins.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// WMMA builtins (matrix multiply-accumulate of the form D = A * B + C).
+// The _w32 suffix indicates the builtin requires a wavefront size of 32.
+// The _w64 suffix indicates the builtin requires a wavefront size of 64; its C and result vectors have half as many elements as the _w32 form.
+//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, "V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, "V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, "V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, "V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts")
+
//===----------------------------------------------------------------------===//
// Special builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b22d1f76c1a1d..8c7ee6b078f2e 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16861,6 +16861,69 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
RayInverseDir, TextureDescr});
}
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: {
+
+ // Each WMMA builtin computes a matrix multiply-accumulate of the form:
+ // D = A * B + C
+ // The result type always matches the type of matrix C, so C's type is used
+ // below as the overload type when looking up the intrinsic.
+ unsigned ArgForMatchingRetType; // index of matrix C in the builtin's argument list
+ unsigned BuiltinWMMAOp; // intrinsic ID; each _w32/_w64 builtin pair maps to the same one
+
+ switch (BuiltinID) { // no default: the outer case labels above list exactly the IDs handled here
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
+ ArgForMatchingRetType = 2; // arguments are (A, B, C)
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
+ ArgForMatchingRetType = 2; // arguments are (A, B, C)
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
+ ArgForMatchingRetType = 2; // arguments are (A, B, C, bool flag)
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
+ ArgForMatchingRetType = 2; // arguments are (A, B, C, bool flag)
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
+ ArgForMatchingRetType = 4; // arguments are (bool, A, bool, B, C, bool)
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
+ ArgForMatchingRetType = 4; // arguments are (bool, A, bool, B, C, bool)
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
+ break;
+ }
+
+ SmallVector<Value *, 6> Args; // emitted operands, in the builtin's argument order
+ for (int i = 0, e = E->getNumArgs(); i != e; ++i)
+ Args.push_back(EmitScalarExpr(E->getArg(i)));
+
+ Function *F = CGM.getIntrinsic(BuiltinWMMAOp, // overloaded on the type of matrix C
+ {Args[ArgForMatchingRetType]->getType()});
+
+ return Builder.CreateCall(F, Args);
+ }
+
// amdgcn workitem
case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
new file mode 100644
index 0000000000000..b392750f50844
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
@@ -0,0 +1,82 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
+
+typedef float v4f __attribute__((ext_vector_type(4))); // NOTE(review): appears unused in this file
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef half v16h __attribute__((ext_vector_type(16)));
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+typedef short v16s __attribute__((ext_vector_type(16)));
+
+#ifdef WMMA_GFX1100_TESTS
+
+// Wave32 - each test verifies that a _w32 WMMA builtin is emitted as a call to the matching llvm.amdgcn.wmma intrinsic.
+
+//
+// amdgcn_wmma_f32_16x16x16_f16 (half A/B operands, float C and result)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32
+// CHECK-GFX1100: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %a, <16 x half> %b, <8 x float> %c)
+void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a, b, c);
+}
+
+//
+// amdgcn_wmma_f32_16x16x16_bf16 (A/B operands passed as short vectors, float C and result)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32
+// CHECK-GFX1100: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %a, <16 x i16> %b, <8 x float> %c)
+void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a, b, c);
+}
+
+//
+// amdgcn_wmma_f16_16x16x16_f16 (half A/B/C and result, plus a trailing constant bool argument)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32
+// CHECK-GFX1100: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, i1 true)
+void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16 (short-vector A/B/C and result, plus a trailing constant bool argument)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32
+// CHECK-GFX1100: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, i1 true)
+void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v16s c)
+{
+ *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu8 (v4i A/B operands, v8i C and result; constant bool before A, before B and last)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32
+// CHECK-GFX1100: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> %a, i1 true, <4 x i32> %b, <8 x i32> %c, i1 false)
+void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a, true, b, c, false);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu4 (v2i A/B operands, v8i C and result; constant bool before A, before B and last)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32
+// CHECK-GFX1100: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> %a, i1 true, <2 x i32> %b, <8 x i32> %c, i1 false)
+void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a, true, b, c, false);
+}
+
+#endif
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
new file mode 100644
index 0000000000000..99664b2443288
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
@@ -0,0 +1,84 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
+
+typedef float v4f __attribute__((ext_vector_type(4)));
+typedef float v8f __attribute__((ext_vector_type(8))); // NOTE(review): appears unused in this file
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef half v16h __attribute__((ext_vector_type(16)));
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8))); // NOTE(review): appears unused in this file
+typedef short v8s __attribute__((ext_vector_type(8)));
+typedef short v16s __attribute__((ext_vector_type(16)));
+
+#ifdef WMMA_GFX1100_TESTS
+
+// Wave64 - each test verifies that a _w64 WMMA builtin is emitted as a call to the matching llvm.amdgcn.wmma intrinsic. NOTE(review): the run line above sets no wavefront-size target feature; confirm that is intended for the _w64 builtins.
+
+//
+// amdgcn_wmma_f32_16x16x16_f16 (half A/B operands, 4-element float C and result)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64
+// CHECK-GFX1100: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %a, <16 x half> %b, <4 x float> %c)
+void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a, b, c);
+}
+
+//
+// amdgcn_wmma_f32_16x16x16_bf16 (A/B operands passed as short vectors, 4-element float C and result)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64
+// CHECK-GFX1100: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %a, <16 x i16> %b, <4 x float> %c)
+void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a, b, c);
+}
+
+//
+// amdgcn_wmma_f16_16x16x16_f16 (half A/B operands, 8-element half C and result, trailing constant bool argument)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64
+// CHECK-GFX1100: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %a, <16 x half> %b, <8 x half> %c, i1 true)
+void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16 (short-vector A/B operands, 8-element short C and result, trailing constant bool argument)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64
+// CHECK-GFX1100: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %a, <16 x i16> %b, <8 x i16> %c, i1 true)
+void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8s c)
+{
+ *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu8 (v4i A/B operands, v4i C and result; constant bool before A, before B and last)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64
+// CHECK-GFX1100: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> %a, i1 true, <4 x i32> %b, <4 x i32> %c, i1 false)
+void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a, true, b, c, false);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu4 (v2i A/B operands, v4i C and result; constant bool before A, before B and last)
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64
+// CHECK-GFX1100: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> %a, i1 true, <2 x i32> %b, <4 x i32> %c, i1 false)
+void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, v2i a, v2i b, v4i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a, true, b, c, false);
+}
+
+#endif
More information about the cfe-commits
mailing list