[clang] 4a78225 - [AMDGPU] Add WMMA clang builtins

Thu Jun 30 23:56:06 PDT 2022

Author: Piotr Sobczak
Date: 2022-07-01T08:55:25+02:00
New Revision: 4a782252127761b60d33e74f9d9acb0aad6f742f

URL: https://github.com/llvm/llvm-project/commit/4a782252127761b60d33e74f9d9acb0aad6f742f
DIFF: https://github.com/llvm/llvm-project/commit/4a782252127761b60d33e74f9d9acb0aad6f742f.diff

LOG: [AMDGPU] Add WMMA clang builtins

Add WMMA clang builtins and tests. Extra changes in code
are needed to handle function overloads.

WavefrontSize 32:
__builtin_amdgcn_wmma_f32_16x16x16_f16_w32
__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32
__builtin_amdgcn_wmma_f16_16x16x16_f16_w32
__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32
__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32
__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32

WavefrontSize 64:
__builtin_amdgcn_wmma_f32_16x16x16_f16_w64
__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64
__builtin_amdgcn_wmma_f16_16x16x16_f16_w64
__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64
__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64
__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D128952

Added: 
    clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
    clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl

Modified: 
    clang/include/clang/Basic/BuiltinsAMDGPU.def
    clang/lib/CodeGen/CGBuiltin.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index bd188c7f34371..68bcf546d177c 100644

--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -252,6 +252,30 @@ TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV4fV4hV4hV4Ui
 TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_l, "V4UiWUifV4fV4fV4fV4Ui", "nc", "gfx10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4hV4hV4Ui", "nc", "gfx10-insts")
 
+
+//===----------------------------------------------------------------------===//
+// GFX11+ only builtins.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// WMMA builtins.
+// Postfix w32 indicates the builtin requires wavefront size of 32.
+// Postfix w64 indicates the builtin requires wavefront size of 64.
+//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, "V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, "V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, "V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, "V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts")
+
 //===----------------------------------------------------------------------===//
 // Special builtins.
 //===----------------------------------------------------------------------===//

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b22d1f76c1a1d..8c7ee6b078f2e 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16861,6 +16861,69 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                   RayInverseDir, TextureDescr});
   }
 
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: {
+
+    // These operations perform a matrix multiplication and accumulation of
+    // the form:
+    //             D = A * B + C
+    // The return type always matches the type of matrix C.
+    unsigned ArgForMatchingRetType;
+    unsigned BuiltinWMMAOp;
+
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
+      ArgForMatchingRetType = 2;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
+      ArgForMatchingRetType = 2;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
+      ArgForMatchingRetType = 2;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
+      ArgForMatchingRetType = 2;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
+      ArgForMatchingRetType = 4;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
+      ArgForMatchingRetType = 4;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
+      break;
+    }
+
+    SmallVector<Value *, 6> Args;
+    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
+      Args.push_back(EmitScalarExpr(E->getArg(i)));
+
+    Function *F = CGM.getIntrinsic(BuiltinWMMAOp,
+                                   {Args[ArgForMatchingRetType]->getType()});
+
+    return Builder.CreateCall(F, Args);
+  }
+
   // amdgcn workitem
   case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
new file mode 100644
index 0000000000000..b392750f50844
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
@@ -0,0 +1,82 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
+
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v8f   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef int    v2i   __attribute__((ext_vector_type(2)));
+typedef int    v4i   __attribute__((ext_vector_type(4)));
+typedef int    v8i   __attribute__((ext_vector_type(8)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+
+#ifdef WMMA_GFX1100_TESTS
+
+// Wave32
+
+//
+// amdgcn_wmma_f32_16x16x16_f16
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32
+// CHECK-GFX1100: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %a, <16 x half> %b, <8 x float> %c)
+void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a, b, c);
+}
+
+//
+// amdgcn_wmma_f32_16x16x16_bf16
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32
+// CHECK-GFX1100: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %a, <16 x i16> %b, <8 x float> %c)
+void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a, b, c);
+}
+
+//
+// amdgcn_wmma_f16_16x16x16_f16
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32
+// CHECK-GFX1100: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, i1 true)
+void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32
+// CHECK-GFX1100: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, i1 true)
+void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v16s c)
+{
+  *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu8
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32
+// CHECK-GFX1100: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> %a, i1 true, <4 x i32> %b, <8 x i32> %c, i1 false)
+void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c)
+{
+  *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a, true, b, c, false);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu4
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32
+// CHECK-GFX1100: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> %a, i1 true, <2 x i32> %b, <8 x i32> %c, i1 false)
+void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
+{
+  *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a, true, b, c, false);
+}
+
+#endif

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
new file mode 100644
index 0000000000000..99664b2443288
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
@@ -0,0 +1,84 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
+
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v8f   __attribute__((ext_vector_type(8)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef int    v2i   __attribute__((ext_vector_type(2)));
+typedef int    v4i   __attribute__((ext_vector_type(4)));
+typedef int    v8i   __attribute__((ext_vector_type(8)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+
+#ifdef WMMA_GFX1100_TESTS
+
+// Wave64
+
+//
+// amdgcn_wmma_f32_16x16x16_f16
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64
+// CHECK-GFX1100: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %a, <16 x half> %b, <4 x float> %c)
+void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a, b, c);
+}
+
+//
+// amdgcn_wmma_f32_16x16x16_bf16
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64
+// CHECK-GFX1100: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %a, <16 x i16> %b, <4 x float> %c)
+void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a, b, c);
+}
+
+//
+// amdgcn_wmma_f16_16x16x16_f16
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64
+// CHECK-GFX1100: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %a, <16 x half> %b, <8 x half> %c, i1 true)
+void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64
+// CHECK-GFX1100: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %a, <16 x i16> %b, <8 x i16> %c, i1 true)
+void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8s c)
+{
+  *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu8
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64
+// CHECK-GFX1100: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> %a, i1 true, <4 x i32> %b, <4 x i32> %c, i1 false)
+void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c)
+{
+  *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a, true, b, c, false);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu4
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64
+// CHECK-GFX1100: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> %a, i1 true, <2 x i32> %b, <4 x i32> %c, i1 false)
+void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, v2i a, v2i b, v4i c)
+{
+  *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a, true, b, c, false);
+}
+
+#endif