[llvm] [AMDGPU] Allow bank conflicts on src0 for V_DUAL_MOV_B32 for gfx1170 (PR #186100)
Mirko Brkušanin via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 12 04:47:16 PDT 2026
https://github.com/mbrkusanin created https://github.com/llvm/llvm-project/pull/186100
None
>From 78f075d11ead1c46f82762b5443732ef23304f02 Mon Sep 17 00:00:00 2001
From: Mirko Brkusanin <Mirko.Brkusanin at amd.com>
Date: Thu, 12 Mar 2026 12:43:56 +0100
Subject: [PATCH] [AMDGPU] Allow bank conflicts on src0 for V_DUAL_MOV_B32 for
gfx1170
---
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 3 +-
llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 4 +-
llvm/test/CodeGen/AMDGPU/vopd-combine.mir | 350 +++++++++----
.../test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll | 486 ++++++------------
llvm/test/MC/AMDGPU/gfx1170_asm_features.s | 8 +
5 files changed, 397 insertions(+), 454 deletions(-)
create mode 100644 llvm/test/MC/AMDGPU/gfx1170_asm_features.s
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index bbf1e2be86950..34c9ba58fd6b7 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -3967,9 +3967,10 @@ AMDGPUAsmParser::checkVOPDRegBankConstraints(const MCInst &Inst, bool AsVOPD3) {
: MCRegister();
};
- // On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
+ // On GFX1170+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
// source-cache.
bool SkipSrc =
+ Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx13 ||
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 800d5bfa2314f..b17cabf37d53f 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -142,9 +142,9 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
return false;
- // On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
+ // On GFX1170+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
// source-cache.
- bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 &&
+ bool SkipSrc = (ST.hasGFX11_7Insts() || ST.hasGFX12Insts()) &&
MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
MIY.getOpcode() == AMDGPU::V_MOV_B32_e32;
bool AllowSameVGPR = ST.hasGFX1250Insts();
diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
index 3a2b0996edacf..b5c25f862ba29 100644
--- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX1100 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX1170 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX12 %s
@@ -43,12 +45,19 @@ body: |
; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_schedule
- ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_schedule
+ ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_schedule
+ ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_schedule
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@@ -80,12 +89,19 @@ body: |
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_fmamk
- ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_fmamk
+ ; PAIR-GFX1100: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_fmamk
+ ; PAIR-GFX1170: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1170 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_fmamk
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
@@ -155,19 +171,33 @@ body: |
; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
;
- ; PAIR-GFX11-LABEL: name: vopd_cndmask
- ; PAIR-GFX11: liveins: $vcc_lo
- ; PAIR-GFX11-NEXT: {{ $}}
- ; PAIR-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $sgpr20 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
+ ; PAIR-GFX1100-LABEL: name: vopd_cndmask
+ ; PAIR-GFX1100: liveins: $vcc_lo
+ ; PAIR-GFX1100-NEXT: {{ $}}
+ ; PAIR-GFX1100-NEXT: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $sgpr20 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_cndmask
+ ; PAIR-GFX1170: liveins: $vcc_lo
+ ; PAIR-GFX1170-NEXT: {{ $}}
+ ; PAIR-GFX1170-NEXT: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $sgpr20 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx1170 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
;
; PAIR-GFX12-LABEL: name: vopd_cndmask
; PAIR-GFX12: liveins: $vcc_lo
@@ -211,10 +241,15 @@ body: |
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_mov
- ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_mov
+ ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_mov
+ ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1170 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@@ -239,10 +274,15 @@ body: |
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec
; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_mov_mov
- ; PAIR-GFX11: $sgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $sgpr7 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_mov_mov
+ ; PAIR-GFX1100: $sgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $sgpr7 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_mov_mov
+ ; PAIR-GFX1170: $sgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $sgpr7 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov_mov
; PAIR-GFX12: $sgpr0 = IMPLICIT_DEF
@@ -300,12 +340,19 @@ body: |
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_constants_inlinable
- ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_constants_inlinable
+ ; PAIR-GFX1100: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_constants_inlinable
+ ; PAIR-GFX1170: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1170 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_constants_inlinable
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
@@ -338,12 +385,19 @@ body: |
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_constants_same
- ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_constants_same
+ ; PAIR-GFX1100: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_constants_same
+ ; PAIR-GFX1170: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1170 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_constants_same
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
@@ -373,10 +427,15 @@ body: |
; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 981467136, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAAK_F32 killed $sgpr0, killed $vgpr0, 981467136, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_mov_fmaak_constants_same
- ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $sgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx11 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_mov_fmaak_constants_same
+ ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $sgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx11 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_mov_fmaak_constants_same
+ ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $sgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx1170 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov_fmaak_constants_same
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@@ -403,11 +462,17 @@ body: |
; SCHED-NEXT: DBG_VALUE $vgpr0, 0, 0
; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_debug
- ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: DBG_VALUE $vgpr0, 0, 0
+ ; PAIR-GFX1100-LABEL: name: vopd_debug
+ ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: DBG_VALUE $vgpr0, 0, 0
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_debug
+ ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: DBG_VALUE $vgpr0, 0, 0
;
; PAIR-GFX12-LABEL: name: vopd_debug
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@@ -451,23 +516,41 @@ body: |
; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained
- ; PAIR-GFX11: liveins: $vcc_lo
- ; PAIR-GFX11-NEXT: {{ $}}
- ; PAIR-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_schedule_unconstrained
+ ; PAIR-GFX1100: liveins: $vcc_lo
+ ; PAIR-GFX1100-NEXT: {{ $}}
+ ; PAIR-GFX1100-NEXT: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_schedule_unconstrained
+ ; PAIR-GFX1170: liveins: $vcc_lo
+ ; PAIR-GFX1170-NEXT: {{ $}}
+ ; PAIR-GFX1170-NEXT: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx1170 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained
; PAIR-GFX12: liveins: $vcc_lo
@@ -551,32 +634,59 @@ body: |
; SCHED-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained_2
- ; PAIR-GFX11: liveins: $vcc_lo
- ; PAIR-GFX11-NEXT: {{ $}}
- ; PAIR-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr20 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_schedule_unconstrained_2
+ ; PAIR-GFX1100: liveins: $vcc_lo
+ ; PAIR-GFX1100-NEXT: {{ $}}
+ ; PAIR-GFX1100-NEXT: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr20 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+ ; PAIR-GFX1100-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_schedule_unconstrained_2
+ ; PAIR-GFX1170: liveins: $vcc_lo
+ ; PAIR-GFX1170-NEXT: {{ $}}
+ ; PAIR-GFX1170-NEXT: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr20 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx1170 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx1170 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx1170 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
+ ; PAIR-GFX1170-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained_2
; PAIR-GFX12: liveins: $vcc_lo
@@ -657,11 +767,17 @@ body: |
; SCHED-NEXT: $vgpr4 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_mov_fixup
- ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_mov_fixup
+ ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_mov_fixup
+ ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1170 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1170-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov_fixup
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@@ -726,11 +842,16 @@ body: |
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_mov_mov_same_src_bank
- ; PAIR-GFX11: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr5 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_mov_mov_same_src_bank
+ ; PAIR-GFX1100: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+ ; PAIR-GFX1100-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_mov_mov_same_src_bank
+ ; PAIR-GFX1170: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 killed $vgpr1, killed $vgpr5, implicit $exec, implicit $exec, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov_mov_same_src_bank
; PAIR-GFX12: $vgpr1 = IMPLICIT_DEF
@@ -754,10 +875,15 @@ body: |
; SCHED-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
; SCHED-NEXT: $vgpr1 = V_ADD_F32_e32 killed $vgpr3, $vgpr3, implicit $mode, implicit $exec
;
- ; PAIR-GFX11-LABEL: name: vopd_combine_opy_overwrites_opx
- ; PAIR-GFX11: $vgpr1 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
- ; PAIR-GFX11-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr1, killed $vgpr3, $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; PAIR-GFX1100-LABEL: name: vopd_combine_opy_overwrites_opx
+ ; PAIR-GFX1100: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1100-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr1, killed $vgpr3, $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
+ ; PAIR-GFX1170-LABEL: name: vopd_combine_opy_overwrites_opx
+ ; PAIR-GFX1170: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-GFX1170-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1170 killed $vgpr1, killed $vgpr3, $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_combine_opy_overwrites_opx
; PAIR-GFX12: $vgpr1 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
index 2558dc3903640..7148f3d614650 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
@@ -17,37 +17,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v11, v10
-; GFX1170-NEXT: v_mov_b32_e32 v12, v10
-; GFX1170-NEXT: v_mov_b32_e32 v13, v10
-; GFX1170-NEXT: v_mov_b32_e32 v14, v10
-; GFX1170-NEXT: v_mov_b32_e32 v15, v10
-; GFX1170-NEXT: v_mov_b32_e32 v16, v10
-; GFX1170-NEXT: v_mov_b32_e32 v17, v10
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
-; GFX1170-NEXT: s_clause 0x1
-; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
-; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
-; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
-; GFX12-NEXT: v_mov_b32_e32 v17, v10
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v10, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
+; GCN-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
+; GCN-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
+; GCN-NEXT: v_mov_b32_e32 v17, v10
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -69,37 +51,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v11, v10
-; GFX1170-NEXT: v_mov_b32_e32 v12, v10
-; GFX1170-NEXT: v_mov_b32_e32 v13, v10
-; GFX1170-NEXT: v_mov_b32_e32 v14, v10
-; GFX1170-NEXT: v_mov_b32_e32 v15, v10
-; GFX1170-NEXT: v_mov_b32_e32 v16, v10
-; GFX1170-NEXT: v_mov_b32_e32 v17, v10
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
-; GFX1170-NEXT: s_clause 0x1
-; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
-; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
-; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
-; GFX12-NEXT: v_mov_b32_e32 v17, v10
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v10, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
+; GCN-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
+; GCN-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
+; GCN-NEXT: v_mov_b32_e32 v17, v10
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -119,26 +83,15 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v10, 0x42004200
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v11, v10
-; GFX1170-NEXT: v_mov_b32_e32 v12, v10
-; GFX1170-NEXT: v_mov_b32_e32 v13, v10
-; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
-; GFX12-NEXT: v_mov_b32_e32 v13, v10
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v10, 0x42004200
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
@@ -146,26 +99,15 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3f803f80
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v11, v10
-; GFX1170-NEXT: v_mov_b32_e32 v12, v10
-; GFX1170-NEXT: v_mov_b32_e32 v13, v10
-; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v10, 0x3f803f80
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
-; GFX12-NEXT: v_mov_b32_e32 v13, v10
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v10, 0x3f803f80
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
@@ -173,26 +115,15 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v11, v10
-; GFX1170-NEXT: v_mov_b32_e32 v12, v10
-; GFX1170-NEXT: v_mov_b32_e32 v13, v10
-; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
-; GFX12-NEXT: v_mov_b32_e32 v13, v10
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
@@ -214,37 +145,19 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v7, v6
-; GFX1170-NEXT: v_mov_b32_e32 v8, v6
-; GFX1170-NEXT: v_mov_b32_e32 v9, v6
-; GFX1170-NEXT: v_mov_b32_e32 v10, v6
-; GFX1170-NEXT: v_mov_b32_e32 v11, v6
-; GFX1170-NEXT: v_mov_b32_e32 v12, v6
-; GFX1170-NEXT: v_mov_b32_e32 v13, v6
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX1170-NEXT: s_clause 0x1
-; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
-; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
-; GFX12-NEXT: v_mov_b32_e32 v13, v6
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x80
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v13, v6
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -266,37 +179,19 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v4, 0x80
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v5, v4
-; GFX1170-NEXT: v_mov_b32_e32 v6, v4
-; GFX1170-NEXT: v_mov_b32_e32 v7, v4
-; GFX1170-NEXT: v_mov_b32_e32 v8, v4
-; GFX1170-NEXT: v_mov_b32_e32 v9, v4
-; GFX1170-NEXT: v_mov_b32_e32 v10, v4
-; GFX1170-NEXT: v_mov_b32_e32 v11, v4
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
-; GFX1170-NEXT: s_clause 0x1
-; GFX1170-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX1170-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
-; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4
-; GFX12-NEXT: v_mov_b32_e32 v11, v4
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v4, 0x80
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GCN-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GCN-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4
+; GCN-NEXT: v_mov_b32_e32 v11, v4
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -318,37 +213,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v7, v6
-; GFX1170-NEXT: v_mov_b32_e32 v8, v6
-; GFX1170-NEXT: v_mov_b32_e32 v9, v6
-; GFX1170-NEXT: v_mov_b32_e32 v10, v6
-; GFX1170-NEXT: v_mov_b32_e32 v11, v6
-; GFX1170-NEXT: v_mov_b32_e32 v12, v6
-; GFX1170-NEXT: v_mov_b32_e32 v13, v6
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX1170-NEXT: s_clause 0x1
-; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
-; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
-; GFX12-NEXT: v_mov_b32_e32 v13, v6
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v13, v6
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -370,37 +247,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v7, v6
-; GFX1170-NEXT: v_mov_b32_e32 v8, v6
-; GFX1170-NEXT: v_mov_b32_e32 v9, v6
-; GFX1170-NEXT: v_mov_b32_e32 v10, v6
-; GFX1170-NEXT: v_mov_b32_e32 v11, v6
-; GFX1170-NEXT: v_mov_b32_e32 v12, v6
-; GFX1170-NEXT: v_mov_b32_e32 v13, v6
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX1170-NEXT: s_clause 0x1
-; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
-; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
-; GFX12-NEXT: v_mov_b32_e32 v13, v6
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v13, v6
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -422,37 +281,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v7, v6
-; GFX1170-NEXT: v_mov_b32_e32 v8, v6
-; GFX1170-NEXT: v_mov_b32_e32 v9, v6
-; GFX1170-NEXT: v_mov_b32_e32 v10, v6
-; GFX1170-NEXT: v_mov_b32_e32 v11, v6
-; GFX1170-NEXT: v_mov_b32_e32 v12, v6
-; GFX1170-NEXT: v_mov_b32_e32 v13, v6
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX1170-NEXT: s_clause 0x1
-; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
-; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
-; GFX12-NEXT: v_mov_b32_e32 v13, v6
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v13, v6
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -474,37 +315,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v7, v6
-; GFX1170-NEXT: v_mov_b32_e32 v8, v6
-; GFX1170-NEXT: v_mov_b32_e32 v9, v6
-; GFX1170-NEXT: v_mov_b32_e32 v10, v6
-; GFX1170-NEXT: v_mov_b32_e32 v11, v6
-; GFX1170-NEXT: v_mov_b32_e32 v12, v6
-; GFX1170-NEXT: v_mov_b32_e32 v13, v6
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX1170-NEXT: s_clause 0x1
-; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
-; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
-; GFX12-NEXT: v_mov_b32_e32 v13, v6
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v13, v6
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -526,37 +349,19 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX1170-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
-; GFX1170: ; %bb.0: ; %bb
-; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_mov_b32_e32 v7, v6
-; GFX1170-NEXT: v_mov_b32_e32 v8, v6
-; GFX1170-NEXT: v_mov_b32_e32 v9, v6
-; GFX1170-NEXT: v_mov_b32_e32 v10, v6
-; GFX1170-NEXT: v_mov_b32_e32 v11, v6
-; GFX1170-NEXT: v_mov_b32_e32 v12, v6
-; GFX1170-NEXT: v_mov_b32_e32 v13, v6
-; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1170-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX1170-NEXT: s_clause 0x1
-; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX1170-NEXT: s_endpgm
-;
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
-; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
-; GFX12-NEXT: v_mov_b32_e32 v13, v6
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x80
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v13, v6
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -574,3 +379,6 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32>
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/MC/AMDGPU/gfx1170_asm_features.s b/llvm/test/MC/AMDGPU/gfx1170_asm_features.s
new file mode 100644
index 0000000000000..3cc0cc9b74cf8
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1170_asm_features.s
@@ -0,0 +1,8 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1170 -show-encoding %s | FileCheck -check-prefix=GFX1170 %s
+
+//===----------------------------------------------------------------------===//
+// A VOPD OpY mov_b32 instruction uses SRC2 source-cache if OpX is also mov_b32
+//===----------------------------------------------------------------------===//
+
+v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v1
+// GFX1170: encoding: [0x05,0x01,0x10,0xca,0x01,0x01,0x02,0x02]
More information about the llvm-commits
mailing list