[llvm] d9972f8 - [NVPTX] Added llvm.nvvm.mma.m8n8k4.* intrinsics
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 28 14:09:10 PDT 2019
Author: Artem Belevich
Date: 2019-10-28T13:55:30-07:00
New Revision: d9972f848294b06807c8764615852ba2bc1e8a74
URL: https://github.com/llvm/llvm-project/commit/d9972f848294b06807c8764615852ba2bc1e8a74
DIFF: https://github.com/llvm/llvm-project/commit/d9972f848294b06807c8764615852ba2bc1e8a74.diff
LOG: [NVPTX] Added llvm.nvvm.mma.m8n8k4.* intrinsics
Differential Revision: https://reviews.llvm.org/D69324
Added:
Modified:
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/test/CodeGen/NVPTX/wmma.py
Removed:
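
For readers skimming the patch, here is a minimal sketch (not part of the commit) of what one of the new intrinsics looks like at the IR level, assuming the row/col f16/f16 variant; the @sample wrapper and the value names are purely illustrative. As the patch notes, the m8n8k4 A and B fragments are smaller than the wmma ones: two <2 x half> values each, while C and D carry four.

    ; Illustrative only -- not taken from the patch. Fragment shapes follow the
    ; RepLLVMType counts added in IntrinsicsNVVM.td (A/B: 2 x <2 x half>,
    ; C/D: 4 x <2 x half>).
    declare {<2 x half>, <2 x half>, <2 x half>, <2 x half>}
      @llvm.nvvm.mma.m8n8k4.row.col.f16.f16(
        <2 x half>, <2 x half>,                          ; fragment A
        <2 x half>, <2 x half>,                          ; fragment B
        <2 x half>, <2 x half>, <2 x half>, <2 x half>)  ; fragment C

    define {<2 x half>, <2 x half>, <2 x half>, <2 x half>} @sample(
        <2 x half> %a0, <2 x half> %a1,
        <2 x half> %b0, <2 x half> %b1,
        <2 x half> %c0, <2 x half> %c1, <2 x half> %c2, <2 x half> %c3) {
      %r = call {<2 x half>, <2 x half>, <2 x half>, <2 x half>}
          @llvm.nvvm.mma.m8n8k4.row.col.f16.f16(
            <2 x half> %a0, <2 x half> %a1,
            <2 x half> %b0, <2 x half> %b1,
            <2 x half> %c0, <2 x half> %c1, <2 x half> %c2, <2 x half> %c3)
      ret {<2 x half>, <2 x half>, <2 x half>, <2 x half>} %r
    }

Compiled with -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64, this should select the new mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 form added to NVPTXIntrinsics.td rather than a wmma.mma instruction.
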
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 0483d965ba64..ec328d69a8dd 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -53,6 +53,10 @@ class WMMA_REGS<string Geom, string Frag, string PtxEltType> {
string gft = Geom#":"#Frag#":"#ptx_elt_type;
string ft = frag#":"#ptx_elt_type;
list<LLVMType> regs = !cond(
+ // mma.sync.m8n8k4 uses smaller a/b fragments than wmma fp ops
+ !eq(gft,"m8n8k4:a:f16") : RepLLVMType<2, llvm_v2f16_ty>.ret,
+ !eq(gft,"m8n8k4:b:f16") : RepLLVMType<2, llvm_v2f16_ty>.ret,
+
// fp16 -> fp16/fp32 @ m16n16k16/m8n32k16/m32n8k16
// All currently supported geometries use the same fragment format,
// so we only need to consider {fragment, type}.
@@ -137,13 +141,19 @@ class MMA_SIGNATURE<WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D> {
class WMMA_NAME_MMA<string ALayout, string BLayout, int Satfinite,
WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D> {
string signature = MMA_SIGNATURE<A, B, C, D>.ret;
- string llvm = "llvm.nvvm.wmma."
- # A.geom
- # ".mma"
- # "." # ALayout
- # "." # BLayout
- # signature
- # !if(Satfinite, ".satfinite", "");
+ string llvm = !if(
+ !eq(A.geom, "m8n8k4"),
+ "llvm.nvvm.mma.m8n8k4"
+ # "." # ALayout
+ # "." # BLayout
+ # signature,
+ "llvm.nvvm.wmma."
+ # A.geom
+ # ".mma"
+ # "." # ALayout
+ # "." # BLayout
+ # signature
+ # !if(Satfinite, ".satfinite", ""));
string record = !subst(".", "_",
!subst("llvm.", "int_", llvm));
@@ -160,7 +170,7 @@ class MMA_OPS<list<string> Geom, list<string> TypeA, list<string> TypeB,
!foldl([]<list<WMMA_REGS>>, TypeA, t2, type_a, !listconcat(t2,
!foldl([]<list<WMMA_REGS>>, !if(!size(TypeB), TypeB, [type_a]), t3, type_b, !listconcat(t3,
!foldl([]<list<WMMA_REGS>>, TypeC, t4, type_c, !listconcat(t4,
- !foldl([]<list<WMMA_REGS>>, !if(!size(TypeC), TypeC, [type_c]), t5, type_d, !listconcat(t5,
+ !foldl([]<list<WMMA_REGS>>, !if(!size(TypeD), TypeD, [type_c]), t5, type_d, !listconcat(t5,
[[WMMA_REGS<geom, "a", type_a>,
WMMA_REGS<geom, "b", type_b>,
WMMA_REGS<geom, "c", type_c>,
@@ -185,19 +195,23 @@ class MMA_LDST_OPS<list<string> Geom, list<string> Frags, list<string> Types> {
// drives generation of corresponding intrinsics and instructions.
class NVVM_MMA_OPS<int _ = 0> {
list<list<WMMA_REGS>> fp_mma_ops = MMA_OPS<
+ ["m8n8k4"],
+ ["f16"], [], ["f16", "f32"], ["f16", "f32"]>.ret;
+ list<list<WMMA_REGS>> fp_wmma_ops = MMA_OPS<
["m16n16k16", "m32n8k16", "m8n32k16"],
["f16"], [], ["f16", "f32"], ["f16", "f32"]>.ret;
- list<list<WMMA_REGS>> int_mma_ops = MMA_OPS<
+ list<list<WMMA_REGS>> int_wmma_ops = MMA_OPS<
["m16n16k16", "m32n8k16", "m8n32k16"],
["s8", "u8"], [], ["s32"], []>.ret;
- list<list<WMMA_REGS>> subint_mma_ops = MMA_OPS<
+ list<list<WMMA_REGS>> subint_wmma_ops = MMA_OPS<
["m8n8k32"],
["s4", "u4"], [], ["s32"], []>.ret;
- list<list<WMMA_REGS>> bit_mma_ops = MMA_OPS<
+ list<list<WMMA_REGS>> bit_wmma_ops = MMA_OPS<
["m8n8k128"],
["b1"], [], ["s32"], []>.ret;
- list<list<WMMA_REGS>> all_mma_ops = !listconcat(fp_mma_ops, int_mma_ops,
- subint_mma_ops, bit_mma_ops);
+ list<list<WMMA_REGS>> all_mma_ops = !listconcat(
+ fp_mma_ops, fp_wmma_ops, int_wmma_ops,
+ subint_wmma_ops, bit_wmma_ops);
list<WMMA_REGS> ldst_ab_ops = MMA_LDST_OPS<
["m16n16k16", "m32n8k16", "m8n32k16"],
@@ -245,10 +259,25 @@ class NVVM_MMA_SUPPORTED<list<WMMA_REGS> frags, string layout_a, string layout_b
# ":" # frags[0].frag
;
string t = frags[0].ptx_elt_type;
+
+ // gcd is a shortcut used to identify instructions that depend on
+ // geom+frag_c+frag_d. Not all instances of this class have all fragments
+ // specified. If there are not enough fragments, the tail evaluates to '?'.
+ string gcd = frags[0].geom
+ # ":"
+ # !if(!eq(!size(frags), 4),
+ frags[2].ptx_elt_type # frags[3].ptx_elt_type,
+ "?");
list<int> ret = !cond(
// Sub-int MMA only supports fixed A/B layout.
// b1 does not support .satf.
!eq(mma#":"#satf, "b1:row:col:0") : [1],
+ // mma.m8n8k4 has no .satf modifier.
+ !and(!eq(frags[0].geom, "m8n8k4"),
+ !ne(satf, 0)): [],
+
+ // mma.m8n8k4 has no C=f32 D=f16 variant.
+ !eq(gcd, "m8n8k4:f32f16"): [],
!eq(mma, "s4:row:col") : [1],
!eq(mma, "u4:row:col") : [1],
!eq(mma, "s4:row:col") : [1],
@@ -4094,7 +4123,7 @@ class NVVM_WMMA_ST<WMMA_REGS Frag, string Layout, int WithStride>
[IntrWriteMem, IntrArgMemOnly, WriteOnly<0>, NoCapture<0>],
WMMA_NAME_LDST<"store", Frag, Layout, WithStride>.intr>;
-// Create all load/store variants
+// Create all load/store variants
foreach layout = ["row", "col"] in {
foreach stride = [0, 1] in {
foreach frag = NVVM_MMA_OPS.all_ld_ops in
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c52195fb0449..76a4a1d4030a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -7400,7 +7400,9 @@ class WMMA_REGINFO<WMMA_REGS r>
// u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128(b1)
!or(!eq(geom,"m8n8k128"),
- !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63]);
+ !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63],
+
+ !eq(geom, "m8n8k4") : [hasSM70, hasPTX64]);
// template DAGs for instruction inputs/output.
dag Outs = !dag(outs, ptx_regs, reg_names);
@@ -7546,25 +7548,37 @@ class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
let OutOperandList = FragD.Outs;
let InOperandList = !con(Args, (ins MmaCode:$ptx));
string TypeList = !cond(
+ !eq(FragD.geom, "m8n8k4") : "." # FragD.ptx_elt_type
+ # ".f16.f16."
+ # FragC.ptx_elt_type,
!eq(FragD.ptx_elt_type, "s32") : ".s32"
# "." # FragA.ptx_elt_type
# "." # FragB.ptx_elt_type
# ".s32",
1: "." # FragD.ptx_elt_type # "." # FragC.ptx_elt_type,
);
- let AsmString = "wmma.mma"
- # !if(!eq(FragA.ptx_elt_type, "b1"), ".xor.popc", "")
- # ".sync"
- # "${ptx:aligned}"
- # "." # ALayout
- # "." # BLayout
- # "." # FragA.geom
- # TypeList
- # !if(Satfinite, ".satfinite", "") # "\n\t\t"
- # FragD.regstring # ",\n\t\t"
- # FragA.regstring # ",\n\t\t"
- # FragB.regstring # ",\n\t\t"
- # FragC.regstring # ";";
+ let AsmString = !if(!eq(FragA.geom, "m8n8k4"),
+ "mma.sync.aligned.m8n8k4"
+ # "." # ALayout
+ # "." # BLayout
+ # TypeList # "\n\t\t"
+ # FragD.regstring # ",\n\t\t"
+ # FragA.regstring # ",\n\t\t"
+ # FragB.regstring # ",\n\t\t"
+ # FragC.regstring # ";",
+ "wmma.mma"
+ # !if(!eq(FragA.ptx_elt_type, "b1"), ".xor.popc", "")
+ # ".sync"
+ # "${ptx:aligned}"
+ # "." # ALayout
+ # "." # BLayout
+ # "." # FragA.geom
+ # TypeList
+ # !if(Satfinite, ".satfinite", "") # "\n\t\t"
+ # FragD.regstring # ",\n\t\t"
+ # FragA.regstring # ",\n\t\t"
+ # FragB.regstring # ",\n\t\t"
+ # FragC.regstring # ";");
}
defset list<WMMA_INSTR> MMAs = {
diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py
index 5123d1db9249..8c140c4d9310 100644
--- a/llvm/test/CodeGen/NVPTX/wmma.py
+++ b/llvm/test/CodeGen/NVPTX/wmma.py
@@ -4,39 +4,47 @@
# Check all variants of instructions supported by PTX60 on SM70
# RUN: python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
-# RUN: --check-prefixes=INTRINSICS,PTX60,SM70
+# RUN: --check-prefixes=INTRINSICS,M16N16
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
-# RUN: --check-prefixes=INTRINSICS,PTX60U,SM70U
+# RUN: --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA
# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
# RUN: | FileCheck %t-ptx60-sm_70.ll
# Check all variants of instructions supported by PTX61 on SM70
# RUN: python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
-# RUN: --check-prefixes=INTRINSICS,PTX60,PTX61,SM70
+# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
-# RUN: --check-prefixes=INTRINSICS,PTX61U,SM70U
+# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA
# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
# RUN: | FileCheck %t-ptx61-sm_70.ll
# Check all variants of instructions supported by PTX63 on SM72
# RUN: python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
-# RUN: --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72
+# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
-# RUN: --check-prefixes=INTRINSICS,PTX63U,SM72U
+# RUN: --check-prefixes=INTRINSICS,NOSUBINT,NOMMA
# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
# RUN: | FileCheck %t-ptx63-sm_72.ll
# Check all variants of instructions supported by PTX63 on SM75
# RUN: python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
-# RUN: --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72,SM75
+# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
-# RUN: --check-prefixes=INTRINSICS,PTX63U,SM75U
+# RUN: --check-prefixes=INTRINSICS,NOMMA
# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
# RUN: | FileCheck %t-ptx63-sm_75.ll
+# Check all variants of instructions supported by PTX64 on SM70+
+# RUN: python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll
+# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
+# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,MMA
+# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
+# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT
+# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
+# RUN: | FileCheck %t-ptx64-sm_70.ll
from __future__ import print_function
@@ -70,10 +78,11 @@ class MMAFrag:
def __init__(self, geom, frag, ptx_elt_type):
self.geom = geom
self.frag = frag
+ self.is_mma = True if geom == "m8n8k4" else False;
self.mma_type = MMAType(ptx_elt_type);
self.nregs = {
- "a:f16" : 8,
- "b:f16" : 8,
+ "a:f16" : 2 if self.is_mma else 8,
+ "b:f16" : 2 if self.is_mma else 8,
"c:f16" : 4,
"d:f16" : 4,
"c:f32" : 8,
@@ -145,7 +154,9 @@ def make_ldst_ops(geoms, frags, types):
in product(geoms, frags, types)]
def get_mma_ops():
- return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+ return (make_mma_ops(["m8n8k4"],
+ ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
+ make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
["s8", "u8"], [], ["s32"], []) +
@@ -165,6 +176,8 @@ def get_ldst_ops(kind):
def is_geom_supported(geom):
# geometries for FP and ints.
+ if geom == "m8n8k4":
+ return ptx_version >= 64
if geom in ["m8n32k16", "m32n8k16"]:
return ptx_version >= 61
# geometries for sub-ints.
@@ -186,6 +199,13 @@ def is_mma_variant_supported(op, layout_a, layout_b, satf):
if not (is_type_supported(op.a.mma_type.ptx_type)
and is_geom_supported(op.a.geom)):
return False
+ if op.a.geom == "m8n8k4":
+ if satf:
+ return False
+ if op.c.mma_type.ptx_type == "f32":
+ # If C is f32, D must be, too.
+ return op.d.mma_type.ptx_type == "f32"
+
# sub-integer require row/col layout, and no satf.
if op.a.mma_type.ptx_type in ["s4", "u4", "b1"]:
if op.a.mma_type.ptx_type == "b1" and satf:
@@ -232,8 +252,6 @@ def get_pspace(space):
def check_pattern(frag):
return "{{%s}}" % ", *".join([frag.mma_type.ptx_reg_pattern] * frag.nregs)
-known_geoms = ["m16n16k16", "m8n32k16", "m32n8k16"]
-
def gen_wmma_load_tests():
load_template = """
declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});
@@ -389,6 +407,8 @@ def mma_ptx_signature(op):
if op.a.mma_type.ptx_type in ["s8", "u8", "s4", "u4", "b1"]:
# int and sub-int instructions encode all four types as D.A.B.C
return ".".join(x.mma_type.ptx_type for x in (op.d, op.a, op.b, op.c))
+ if op.a.geom == "m8n8k4":
+ return "%s.f16.f16.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
else:
# the rest are FP instructions use D.C
return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
@@ -411,8 +431,10 @@ def gen_wmma_mma_tests():
ret ${ret_ty} %r;
}
"""
- intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
- instruction_template = "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"
+ wmma_intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
+ wmma_instruction_template = "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"
+ mma_intrinsic_template = "llvm.nvvm.mma.${geom}.${alayout}.${blayout}.${intrinsic_signature}"
+ mma_instruction_template = "mma.sync${aligned}.${geom}.${alayout}.${blayout}.${ptx_signature}"
generated_items=[]
@@ -436,6 +458,13 @@ def gen_wmma_mma_tests():
"mma_variant" : ".xor.popc" if op.a.mma_type.ptx_type == "b1" else "",
}
+ if op.a.geom == "m8n8k4":
+ intrinsic_template = mma_intrinsic_template
+ instruction_template = mma_instruction_template
+ else:
+ intrinsic_template = wmma_intrinsic_template
+ instruction_template = wmma_instruction_template
+
test_params = params
test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
test_params["function"] = test_params["intrinsic"].replace(".", "_")
@@ -458,55 +487,68 @@ def gen_wmma_mma_tests():
# Generate set of checks to verify that that we did generate sensible set of
# tests for the given combination of PTX and SM variants.
#
-# PTX<N>: verifies that we did generate tests for correct classes of intrinsics.
-# PTX<N>U: verifies that we did not generate intrinsics unsupported by
-# the PTX version.
-# SM<N>: verifies that we did generate correct classes of instructions for the SM.
-# SM<N>U: verifies that we did not generate instructions unsupported by the SM
-#
-# Note that SM/PTX constraints overlap, but DAG checks do not allow overlapping
-# matches. We implicitly rely that we generate multiple variants of most of the
-# instructions and usually have enough input data to find more than one match of
-# the same kind, if necessary. When it's not possible (e.g. there's only one
-# m8n8k128.mma.row.col.b1), we may need to match PTX instruction instead.
def gen_check_unsupported_ops(items):
print("; Complete list of intrinsics supported by PTX%d on sm_%d"
% (ptx_version, gpu_arch))
print("; INTRINSICS: {{^; INTRINSICS_LIST_BEGIN}}")
print("""
-; PTX60-DAG: m16n16k16.load.{{[ab].*}}.f16.p
-; PTX60-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX60U-NOT: m32n8k16
-; PTX60U-NOT: m8n32k16
-; PTX60U-NOT: .{{s32|s[48]|u[48]|b1}}
-
-; All features of PTX60, plus m32n8k16/m8n32k16 geometries.
-; PTX61-DAG: m32n8k16.load.{{[ab].*}}.f16.p
-; PTX61-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX61-DAG: m8n32k16.load.{{[ab].*}}.f16.p
-; PTX61-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX61U-NOT: .{{s32|s[48]|u[48]|b1}}
-
-; SM70U-NOT: .{{s32|s[48]|u[48]|b1}}
-
-; PTX63 supports all features of PTX60+PTX61, plus support for integers.
-; Alas we can"t just use PTX<N> checks for that as available instructions
-; depend on SM integers need sm72+ and subinteger ops need sm75, so we
-; transition to SM<N> checks
-; SM72-DAG: m16n16k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m8n32k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m32n8k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m16n16k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m8n32k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m32n8k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
-; SM72U-NOT: .{{s4|u4|b1}}
-
-; SM75-DAG: m8n8k128.load.{{[ab].*}}.b1.p
-; SM75-DAG: m8n8k32.load.{{[ab].*}}.s4.p
-; SM75-DAG: m8n8k32.load.{{[ab].*}}.u4.p
-; SM75-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
-; SM75-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
+
+; NOEXTGEOM-NOT: {{m8n32|m32n8}}
+; NOINT-NOT: .{{s32|s8}}
+; NOSUBINT-NOT: {{s4|u4|b1}}
+; NOMMA-NOT: .m8n8k4.
+
+; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
+; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f32
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f16
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f16
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f32
+
+; PTX60 adds support for m32n8k16/m8n32k16 geometries.
+; EXTGEOM-DAG: m32n8k16.load.{{[ab].*}}.f16.p
+; EXTGEOM-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f32
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f16
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f16
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f32
+
+; EXTGEOM-DAG: m8n32k16.load.{{[ab].*}}.f16.p
+; EXTGEOM-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f32
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f16
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f16
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f32
+
+; INT-DAG: m16n16k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m8n32k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m32n8k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m16n16k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m8n32k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m32n8k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
+; INT-DAG: m16n16k16.mma.{{.*}}.u8
+; INT-DAG: m16n16k16.mma.{{.*}}.s8
+; INT-DAG: m8n32k16.mma.{{.*}}.u8
+; INT-DAG: m8n32k16.mma.{{.*}}.s8
+; INT-DAG: m32n8k16.mma.{{.*}}.u8
+; INT-DAG: m32n8k16.mma.{{.*}}.s8
+
+; SUBINT-DAG: m8n8k128.load.{{[ab].*}}.b1.p
+; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.s4.p
+; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.u4.p
+; SUBINT-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
+; SUBINT-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
+; SUBINT-DAG: m8n8k32.mma.{{.*}}.u4
+; SUBINT-DAG: m8n8k32.mma.{{.*}}.s4
+; SUBINT-DAG: m8n8k128.mma.{{.*}}.b1
+
+; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f32
+; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f16
+; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f16
+; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f32
+;
+
""")
print("; INTRINSICS_LIST_BEGIN")