[llvm] [AMDGPU][wmma] - Add tied wmma intrinsic (PR #69903)

Tue Oct 24 07:33:55 PDT 2023

https://github.com/OutOfCache updated https://github.com/llvm/llvm-project/pull/69903

>From 9430708a7231c3e7df17337bbf75d69cfb87c74f Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Tue, 5 Sep 2023 07:16:39 +0200
Subject: [PATCH 1/3] [AMDGPU/VOP3P] - Simplify wmma instruction defs

Instead of checking for the `_w32` or `_w64` suffix,
we can use the suffix in the definitions directly.
---
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 05e68f46b32605d..e68c82d40c28eaa 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -871,29 +871,16 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
   defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
 
   defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
-  if !eq(Suffix, "_w32") then {
     let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
       let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
-        def _twoaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+        def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
       }
       let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
-        def _threeaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+        def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
       }
     }
-    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w32),
-                            !cast<Instruction>(NAME # _threeaddr_w32)>;
-  } else if !eq(Suffix, "_w64") then {
-    let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
-      let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
-        def _twoaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
-      }
-      let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
-        def _threeaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
-      }
-    }
-    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w64),
-                            !cast<Instruction>(NAME # _threeaddr_w64)>;
-  }
+    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
+                            !cast<Instruction>(NAME # _threeaddr # Suffix)>;
 
   if !eq(Type, WMMAOpSel) then {
     def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;

>From 9eb7f8d9d3a9eeb82754665db96840fb76b138c8 Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Tue, 5 Sep 2023 16:26:41 +0200
Subject: [PATCH 2/3] [AMDGPU/VOP3P] - Add tied wmma_f16 pseudos

Add intrinsics for `wmma_f16` and `wmma_bf16`, which stay as two-address
instructions.

This is a requirement for a future optimization
regarding wmma instructions.
The new changes make use of the `op_sel` argument of `wmma` instructions
to read from the upper halves of the input
accumulator and write to the upper halves of the output matrix.
With two-address instructions, we can guarantee that the content
of the upper halves is the same as the input
accumulator.
With three-address instructions, the output
registers do not copy the content of the input
registers. Instead, the upper halves
remain unchanged from their previous values.
This can cause issues if there are unexpected
values remaining in these registers.

For example:
```
v_wmma_f16_16x16x16_f16 v[0:7],   ..., v[24:31]
v_wmma_f16_16x16x16_f16 v[32:39], ..., v[24:31]
```
After these two instructions run, there is
no guarantee that the content of bits 16-31 of
`v[0:7]` is the same as that of `v[24:31]`.
If we have another instruction like the following:
```
v_wmma_f16_16x16x16_f16 v[0:7], v[24:31], v[32:39], v[0:7] op_sel:[0,0,1]
```
We read from the upper halves of `v[0:7]`, but
the content is not necessarily correct.

For our purpose, we create new pseudo instructions, while maintaining
the behavior of the original instructions.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   2 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   2 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  44 ++++----
 .../AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll  | 100 ++++++++++++++++++
 .../AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll  |  84 +++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll     | 100 ++++++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll     |  84 +++++++++++++++
 7 files changed, 398 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 5f1d1d932f74cbd..b1f2b512628bb0d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2363,6 +2363,8 @@ def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llv
 def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_tied_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_tied_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5b056bd9e5dba2c..f4f9aa5903c458c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4279,6 +4279,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_sudot8:
     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
+    case Intrinsic::amdgcn_wmma_tied_bf16_16x16x16_bf16:
+    case Intrinsic::amdgcn_wmma_tied_f16_16x16x16_f16:
     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index e68c82d40c28eaa..c406e57f9de2654 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -865,22 +865,26 @@ def WMMAOpcode3AddrMappingTable : WMMAMappingTable {
 //    it converts the default pseudo to the pseudo where src2 is not the same as vdst.
 // 3) @earlyclobber on the destination satisfies the constraint during RA.
 
-multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type> {
+multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type, bit convertibleTo3Addr> {
 
   defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2";
   defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
 
   defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
     let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
-      let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
+      let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
         def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
       }
-      let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
-        def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
-      }
     }
-    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
+    if !eq(convertibleTo3Addr, 1) then {
+      let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+        let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
+          def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+        }
+      }
+      def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
                             !cast<Instruction>(NAME # _threeaddr # Suffix)>;
+    }
 
   if !eq(Type, WMMAOpSel) then {
     def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
@@ -893,21 +897,25 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
 
 
 let WaveSizePredicate = isWave32 in {
-  defm V_WMMA_F32_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16",  VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16",   VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_I32_16X16X16_IU8   : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8",   VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
-  defm V_WMMA_I32_16X16X16_IU4   : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4",   VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64,  WMMAUIClamp>;
+  defm V_WMMA_F32_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16",  VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16",   VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_TIED_F16_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16",   VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_tied_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_TIED_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_tied_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_I32_16X16X16_IU8   : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8",   VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
+  defm V_WMMA_I32_16X16X16_IU4   : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4",   VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64,  WMMAUIClamp, 1>;
 }
 
 let WaveSizePredicate = isWave64 in {
-  defm V_WMMA_F32_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16",   VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16",  VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16",   VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_I32_16X16X16_IU8   : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8",   VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
-  defm V_WMMA_I32_16X16X16_IU4   : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4",   VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+  defm V_WMMA_F32_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16",   VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16",  VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16",   VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_TIED_F16_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16",   VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_tied_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_TIED_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_tied_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_I32_16X16X16_IU8   : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8",   VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
+  defm V_WMMA_I32_16X16X16_IU4   : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4",   VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
 
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
index 6ca2dd838d37ac9..1cc1c6d7d46e29d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -4,7 +4,9 @@
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
 
@@ -78,6 +80,55 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
@@ -112,6 +163,55 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
index a18d0a569bfb6ef..66655a3f2d16c98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -4,7 +4,9 @@
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
 
@@ -70,6 +72,47 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
@@ -100,6 +143,47 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index 464c374f638c270..d0f6e592bfbf473 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -4,7 +4,9 @@
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
 
@@ -78,6 +80,55 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
@@ -112,6 +163,55 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index 7b1e29c18c723f0..c97fafd3aaa3c63 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -4,7 +4,9 @@
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
 
@@ -70,6 +72,47 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
@@ -100,6 +143,47 @@ bb:
   ret void
 }
 
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {

>From 1e1309a5725ccb017dab02563ce3951fe1053d0a Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Tue, 24 Oct 2023 16:33:08 +0200
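Subject: [PATCH 3/3] fixup! [AMDGPU/VOP3P] - Add tied wmma_f16 pseudos

Rename the tied WMMA intrinsics and instruction definitions so that
"tied" is a suffix instead of an infix, e.g.
  llvm.amdgcn.wmma.tied.f16.16x16x16.f16   -> llvm.amdgcn.wmma.f16.16x16x16.f16.tied
  llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16 -> llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied
  V_WMMA_TIED_F16_16X16X16_F16             -> V_WMMA_F16_16X16X16_F16_TIED
and update the register-bank mapping and the SelectionDAG/GlobalISel
tests to use the new names.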
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td             |  4 ++--
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp    |  4 ++--
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td          |  8 ++++----
 .../CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll | 12 ++++++------
 .../CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll | 12 ++++++------
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll      | 12 ++++++------
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll      | 12 ++++++------
 7 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b1f2b512628bb0d..80220c241c2d3f5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2363,8 +2363,8 @@ def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llv
 def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
-def int_amdgcn_wmma_tied_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_tied_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_f16_16x16x16_f16_tied   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f4f9aa5903c458c..e409a24007a6b22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4279,8 +4279,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_sudot8:
     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
-    case Intrinsic::amdgcn_wmma_tied_bf16_16x16x16_bf16:
-    case Intrinsic::amdgcn_wmma_tied_f16_16x16x16_f16:
+    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
+    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index c406e57f9de2654..a6c431bae704c1f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -901,8 +901,8 @@ let WaveSizePredicate = isWave32 in {
   defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
   defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16",   VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
   defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
-  defm V_WMMA_TIED_F16_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16",   VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_tied_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 0>;
-  defm V_WMMA_TIED_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_tied_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_F16_16X16X16_F16_TIED   : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16",   VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16_tied, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>;
   defm V_WMMA_I32_16X16X16_IU8   : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8",   VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
   defm V_WMMA_I32_16X16X16_IU4   : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4",   VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64,  WMMAUIClamp, 1>;
 }
@@ -912,8 +912,8 @@ let WaveSizePredicate = isWave64 in {
   defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16",  VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
   defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16",   VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
   defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
-  defm V_WMMA_TIED_F16_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16",   VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_tied_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 0>;
-  defm V_WMMA_TIED_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_tied_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_F16_16X16X16_F16_TIED   : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16",   VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16_tied, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>;
   defm V_WMMA_I32_16X16X16_IU8   : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8",   VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
   defm V_WMMA_I32_16X16X16_IU4   : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4",   VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
index 1cc1c6d7d46e29d..2ddf367f0aafc20 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -4,9 +4,9 @@
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
-declare <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
-declare <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
 
@@ -122,8 +122,8 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
-  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
   store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
   store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
   ret void
@@ -205,8 +205,8 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
-  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
   store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
   store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
index 66655a3f2d16c98..09702ae5cb608fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -4,9 +4,9 @@
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
-declare <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
-declare <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
 
@@ -106,8 +106,8 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res.0 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
-  %res.1 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
   store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
   store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
   ret void
@@ -177,8 +177,8 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
-  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
   store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
   store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index d0f6e592bfbf473..5076fda60a4749b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -4,9 +4,9 @@
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
-declare <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
-declare <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
 
@@ -122,8 +122,8 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
-  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
   store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
   store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
   ret void
@@ -205,8 +205,8 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
-  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
   store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
   store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index c97fafd3aaa3c63..e47a91b4fc0bac1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -4,9 +4,9 @@
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
-declare <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
-declare <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
 
@@ -106,8 +106,8 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res.0 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
-  %res.1 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
   store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
   store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
   ret void
@@ -177,8 +177,8 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
-  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
   store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
   store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
   ret void