[llvm-branch-commits] [llvm] [AMDGPU] Intrinsic and codegen for wmma_f64_16x16x4_f64 (PR #203143)

Wed Jun 10 18:47:32 PDT 2026

https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/203143

>From d67eca21098240fbf656543bd3d30e120c674c14 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 10 Jun 2026 17:13:30 -0700
Subject: [PATCH] [AMDGPU] Intrinsic and codegen for wmma_f64_16x16x4_f64

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   6 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  23 +++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   1 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |   4 +
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |   7 +
 .../AMDGPU/llvm.amdgcn.wmma.gfx1251.w32.ll    |  61 ++++++++
 .../llvm.amdgcn.wmma.imm.gfx1251.w32.ll       | 144 +++++++++++++++++
 .../llvm.amdgcn.wmma.imod.gfx1251.w32.ll      | 145 ++++++++++++++++++
 8 files changed, 390 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1251.w32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1251.w32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1251.w32.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f4f0fa0d5b1bb..f1659f0cd803a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -4054,8 +4054,11 @@ class AMDGPUWmmaIntrinsicModsAll<LLVMType AB, LLVMType CD> :
       LLVMMatchType<1>, // %B
       llvm_i16_ty,      // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs)
       LLVMMatchType<0>,               // %C
+      llvm_i1_ty,        // matrix_a_reuse
+      llvm_i1_ty,        // matrix_b_reuse
     ],
-    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
+    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
+     ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 // D and C are of different types.
@@ -4135,6 +4138,7 @@ class AMDGPUWmmaScaleF4IntrinsicModsC<LLVMType scale_ty> :
 >;
 
 defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
+def int_amdgcn_wmma_f64_16x16x4_f64       : AMDGPUWmmaIntrinsicModsAll<llvm_anyfloat_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f32_16x16x4_f32       : AMDGPUWmmaIntrinsicModsC<llvm_anyfloat_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f32_16x16x32_bf16     : AMDGPUWmmaIntrinsicModsC<llvm_anyfloat_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f32_16x16x32_f16      : AMDGPUWmmaIntrinsicModsC<llvm_anyfloat_ty, llvm_anyfloat_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 981d880bcf7b0..b545ec82ac8b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4086,6 +4086,29 @@ bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
         }
     }
 
+  // Currently f64 immediate vectors are represented as vectors of v2i32, with
+  // different lo and hi 32-bit values even though double values are splatted.
+  // So we have to manually compare to determine whether it is splatted.
+  if (CurDAG->isConstantIntBuildVectorOrConstantInt(SplatSrc32)) {
+    uint64_t Imm64 = 0;
+    for (unsigned i = 0; i < SplatSrc32->getNumOperands(); i += 2) {
+      auto Lo32 = cast<ConstantSDNode>(SplatSrc32->getOperand(i));
+      auto Hi32 = cast<ConstantSDNode>(SplatSrc32->getOperand(i + 1));
+      // Zero-extend both halves so a set bit 31 in Lo cannot clobber Hi.
+      uint64_t LoImm = Lo32->getAPIntValue().getZExtValue();
+      uint64_t HiImm = Hi32->getAPIntValue().getZExtValue();
+      uint64_t Imm64I = (HiImm << 32) | LoImm;
+      if (i == 0) {
+        if (!isInlineImmediate(APInt(64, Imm64I)))
+          return false;
+        Imm64 = Imm64I;
+      } else if (Imm64I != Imm64)
+        return false;
+    } // end for
+    Src = CurDAG->getTargetConstant(Imm64, SDLoc(In), MVT::i64);
+    return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index cc1f2d0664484..db8fc149df1f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4826,6 +4826,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_wmma_f64_16x16x4_f64:
     case Intrinsic::amdgcn_wmma_f32_16x16x4_f32:
     case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:
     case Intrinsic::amdgcn_wmma_f32_16x16x32_f16:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index add70e1d62222..b5798ec0c7078 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -2256,6 +2256,10 @@ let SubtargetPredicate = isGFX125xOnly in {
   def : SWMMACPat<V_SWMMAC_F32_16X16X64_F16_w32_twoaddr,      int_amdgcn_swmmac_f32_16x16x64_f16,      F32_F16X64_SWMMAC_w32>;
   def : SWMMACPat<V_SWMMAC_F16_16X16X64_F16_w32_twoaddr,      int_amdgcn_swmmac_f16_16x16x64_f16,      F16_F16X64_SWMMAC_w32>;
 } // End SubtargetPredicate = isGFX125xOnly
+
+let SubtargetPredicate = HasGFX1251GEMMInsts in {
+  defm : WMMAPat<"V_WMMA_F64_16X16X4_F64_w32",  int_amdgcn_wmma_f64_16x16x4_f64,  F64_F64X4_WMMA_w32>;
+} // End SubtargetPredicate = HasGFX1251GEMMInsts
 } // End WaveSizePredicate = isWave32
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 9c57f1f2e5367..73d09a9dad37f 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -204,6 +204,13 @@ bb:
   ret void
 }
 
+; CHECK: DIVERGENT: %tmp0 = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 false, <2 x double> %A, i1 false, <2 x double> %B, i16 0, <8 x double> %C, i1 false, i1 false)
+define amdgpu_kernel void @wmma_f64_16x16x4_f64(<2 x double> %A, <2 x double> %B, <8 x double> %C, ptr addrspace(1) %out) {
+  %tmp0 = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 0, <8 x double> %C, i1 false, i1 false)
+  store <8 x double> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
 ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
 define amdgpu_kernel void @wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
   %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1251.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1251.w32.ll
new file mode 100644
index 0000000000000..9b4d2df8148fb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1251.w32.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck %s --check-prefix=GFX1251
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 -global-isel < %s | FileCheck %s --check-prefix=GISEL
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64_matrix_a_reuse(<2 x double> %A, <2 x double> %B, <8 x double> %C, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64_matrix_a_reuse:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] matrix_a_reuse
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64_matrix_a_reuse:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] matrix_a_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 0, <8 x double> %C, i1 true, i1 false)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64_matrix_b_reuse(<2 x double> %A, <2 x double> %B, <8 x double> %C, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64_matrix_b_reuse:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] matrix_b_reuse
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64_matrix_b_reuse:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] matrix_b_reuse
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 0, <8 x double> %C, i1 false, i1 true)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1, <2 x double>, i1, <2 x double>, i16, <8 x double>, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1251.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1251.w32.ll
new file mode 100644
index 0000000000000..f654d63b70a63
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1251.w32.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck %s --check-prefix=GFX1251
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 -global-isel < %s | FileCheck %s --check-prefix=GISEL
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64(<2 x double> %A, <2 x double> %B, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[10:25], v[0:3], v[4:7], 1.0
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[22:25], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[18:21], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[10:25], v[0:3], v[4:7], 1.0
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GISEL-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GISEL-NEXT:    global_store_b128 v[8:9], v[18:21], off offset:32
+; GISEL-NEXT:    global_store_b128 v[8:9], v[22:25], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 0, <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>, i1 false, i1 false)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64_non_splat(<2 x double> %A, <2 x double> %B, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64_non_splat:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0x3ff00000
+; GFX1251-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1251-NEXT:    v_dual_mov_b32 v15, 2.0 :: v_dual_mov_b32 v12, v10
+; GFX1251-NEXT:    v_dual_mov_b32 v13, v11 :: v_dual_mov_b32 v14, v10
+; GFX1251-NEXT:    v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v17, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v19, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v20, v10 :: v_dual_mov_b32 v21, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v22, v10 :: v_dual_mov_b32 v23, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v25, v11
+; GFX1251-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[10:25], v[0:3], v[4:7], v[10:25]
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[22:25], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[18:21], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64_non_splat:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    s_mov_b64 s[0:1], 1.0
+; GISEL-NEXT:    s_mov_b64 s[4:5], 2.0
+; GISEL-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[10:25], v[0:3], v[4:7], v[10:25]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GISEL-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GISEL-NEXT:    global_store_b128 v[8:9], v[18:21], off offset:32
+; GISEL-NEXT:    global_store_b128 v[8:9], v[22:25], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 0, <8 x double> <double 1.0, double 1.0, double 2.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>, i1 false, i1 false)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64_non_inlineable(<2 x double> %A, <2 x double> %B, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64_non_inlineable:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0x40080000
+; GFX1251-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT:    v_dual_mov_b32 v12, v10 :: v_dual_mov_b32 v13, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v15, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v17, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v19, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v20, v10 :: v_dual_mov_b32 v21, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v22, v10 :: v_dual_mov_b32 v23, v11
+; GFX1251-NEXT:    v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v25, v11
+; GFX1251-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[10:25], v[0:3], v[4:7], v[10:25]
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[22:25], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[18:21], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64_non_inlineable:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    s_mov_b64 s[0:1], 0x4008000000000000
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GISEL-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[10:25], v[0:3], v[4:7], v[10:25]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GISEL-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GISEL-NEXT:    global_store_b128 v[8:9], v[18:21], off offset:32
+; GISEL-NEXT:    global_store_b128 v[8:9], v[22:25], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 0, <8 x double> <double 3.0, double 3.0, double 3.0, double 3.0, double 3.0, double 3.0, double 3.0, double 3.0>, i1 false, i1 false)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1, <2 x double>, i1, <2 x double>, i16, <8 x double>, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1251.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1251.w32.ll
new file mode 100644
index 0000000000000..fcc3c98e78534
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1251.w32.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck %s --check-prefix=GFX1251
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 -global-isel < %s | FileCheck %s --check-prefix=GISEL
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64_negA(<2 x double> %A, <2 x double> %B, <8 x double> %C, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64_negA:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] neg_lo:[1,0,0]
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64_negA:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] neg_lo:[1,0,0]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 1, <2 x double> %A, i1 0, <2 x double> %B, i16 0, <8 x double> %C, i1 false, i1 false)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64_negB(<2 x double> %A, <2 x double> %B, <8 x double> %C, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64_negB:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] neg_lo:[0,1,0]
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64_negB:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] neg_lo:[0,1,0]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 1, <2 x double> %B, i16 0, <8 x double> %C, i1 false, i1 false)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64_negC(<2 x double> %A, <2 x double> %B, <8 x double> %C, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64_negC:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] neg_lo:[0,0,1]
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64_negC:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] neg_lo:[0,0,1]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 1, <8 x double> %C, i1 false, i1 false)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64_neg_absC(<2 x double> %A, <2 x double> %B, <8 x double> %C, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64_neg_absC:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64_neg_absC:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 3, <8 x double> %C, i1 false, i1 false)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f64_16x16x4_f64_ignoreC(<2 x double> %A, <2 x double> %B, <8 x double> %C, ptr addrspace(1) %out) {
+; GFX1251-LABEL: test_wmma_f64_16x16x4_f64_ignoreC:
+; GFX1251:       ; %bb.0: ; %bb
+; GFX1251-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23]
+; GFX1251-NEXT:    s_clause 0x3
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GFX1251-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GFX1251-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_wmma_f64_16x16x4_f64_ignoreC:
+; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT:    v_wmma_f64_16x16x4_f64 v[8:23], v[0:3], v[4:7], v[8:23]
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    global_store_b128 v[24:25], v[8:11], off
+; GISEL-NEXT:    global_store_b128 v[24:25], v[12:15], off offset:16
+; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:32
+; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:48
+; GISEL-NEXT:    s_endpgm
+bb:
+  %res = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 4, <8 x double> %C, i1 false, i1 false)
+  store <8 x double> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1, <2 x double>, i1, <2 x double>, i16, <8 x double>, i1, i1)