[llvm-branch-commits] [llvm] AMDGPU: Fix not folding splat immediate into VGPR MFMA src2 (PR #150628)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jul 25 07:52:47 PDT 2025
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/150628
None
>From 3c67b08d7533ed3977c5473aaebf0c281f3d7907 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 25 Jul 2025 23:21:34 +0900
Subject: [PATCH] AMDGPU: Fix not folding splat immediate into VGPR MFMA src2
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 4 +
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 162 ++++++------------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 89 ++--------
3 files changed, 70 insertions(+), 185 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e5d1eaad2b8f4..b77da4d612dd4 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1062,9 +1062,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
switch (OpTy) {
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
break;
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
break;
default:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index adbc2df4b3474..7d5bc3f4c0c39 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1083,58 +1083,36 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
; GFX90A-VGPR: ; %bb.0: ; %bb
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
-; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-VGPR-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double 1.0), i32 0, i32 0, i32 0)
@@ -1184,58 +1162,36 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
; GFX90A-VGPR: ; %bb.0: ; %bb
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0xbff00000
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
-; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-VGPR-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0xbff00000
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double -1.0), i32 0, i32 0, i32 0)
@@ -1285,58 +1241,36 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
; GFX90A-VGPR: ; %bb.0: ; %bb
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 64
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: s_nop 7
-; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-VGPR-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 64 to double)), i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index d7257a49b00bb..b4a6451908a6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -3238,27 +3238,11 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v17, v18, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
@@ -3463,13 +3447,10 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
-; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v5, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 3
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -4483,13 +4464,10 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
-; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v5, v[0:3]
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
@@ -4627,25 +4605,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v17, v[0:15]
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 0
@@ -4797,36 +4760,20 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0x3c003c00
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v16
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[4:7], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
More information about the llvm-branch-commits
mailing list