[llvm] dd1d41f - [AMDGPU][True16][CodeGen] fix moveToVALU with proper subreg access in true16 (#132089)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 1 09:40:21 PDT 2025
Author: Brox Chen
Date: 2025-04-01T12:40:18-04:00
New Revision: dd1d41f833c9b28d8a940ba5a0b85b0d47e44e43
URL: https://github.com/llvm/llvm-project/commit/dd1d41f833c9b28d8a940ba5a0b85b0d47e44e43
DIFF: https://github.com/llvm/llvm-project/commit/dd1d41f833c9b28d8a940ba5a0b85b0d47e44e43.diff
LOG: [AMDGPU][True16][CodeGen] fix moveToVALU with proper subreg access in true16 (#132089)
There are V2S copies between vpgr16 and spgr32 in true16 mode. This is
caused by vgpr16 and sgpr32 both selectable by 16bit src in ISel.
When a V2S copy and its useMI are lowered to VALU, this patch check
1. If the generated new VALU is used by a true16 inst. Add subreg access
if necessary.
2. Legalize the V2S copy by replacing it to subreg_to_reg
an example MIR looks like:
```
%2:sgpr_32 = COPY %1:vgpr_16
%3:sgpr_32 = S_OR_B32 %2:sgpr_32, ...
%4:vgpr_16 = V_ADD_F16_t16 %3:sgpr_32, ...
```
currently lowered to
```
%2:vgpr_32 = COPY %1:vgpr_16
%3:vgpr_32 = V_OR_B32 %2:vgpr_32, ...
%4:vgpr_16 = V_ADD_F16_t16 %3:vgpr_32, ...
```
after this patch
```
%2:vgpr_32 = SUBREG_TO_REG 0, %1:vgpr_16, lo16
%3:vgpr_32 = V_OR_B32 %2:vgpr_32, ...
%4:vgpr_16 = V_ADD_F16_t16 %3.lo16:vgpr_32, ...
```
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
llvm/test/CodeGen/AMDGPU/select.f16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4acbc201ec58e..260f80a5f532e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7744,6 +7744,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
return;
}
+
+ // If this is a v2s copy src from vgpr16 to sgpr32,
+ // replace vgpr copy to subreg_to_reg
+ if (ST.useRealTrue16Insts() && Inst.isCopy() &&
+ Inst.getOperand(1).getReg().isVirtual() &&
+ RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
+ const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
+ if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
+ 32 == RI.getRegSizeInBits(*NewDstRC)) {
+ Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(TargetOpcode::SUBREG_TO_REG), NewDstReg)
+ .add(MachineOperand::CreateImm(0))
+ .add(Inst.getOperand(1))
+ .add(MachineOperand::CreateImm(AMDGPU::lo16));
+ Inst.eraseFromParent();
+
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ return;
+ }
+ }
+
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
legalizeOperands(Inst, MDT);
@@ -7837,6 +7860,22 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
assert(NewDstRC);
NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
+
+ // Check useMI of NewInstr. If used by a true16 instruction,
+ // add a lo16 subreg access if size mismatched
+ if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+ E = MRI.use_end();
+ I != E; ++I) {
+ MachineInstr &UseMI = *I->getParent();
+ unsigned UseMIOpcode = UseMI.getOpcode();
+ if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
+ (16 ==
+ RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
+ I->setSubReg(AMDGPU::lo16);
+ }
+ }
+ }
}
fixImplicitOperands(*NewInstr);
// Legalize the operands
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 466f28805dfcf..419f57972a485 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -1,41 +1,35 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
-# XFAIL: *
-# FIXME-TRUE16 reenable after fix-sgpr-copies is updated for true16 flow
---
-name: cmp_f16
+name: cvt_hi_f32_f16
body: |
- bb.0.entry:
- ; GCN-LABEL: name: cmp_f16
+ bb.0:
+ ; GCN-LABEL: name: cvt_hi_f32_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY killed [[COPY]]
- ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[COPY1]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
+ ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[SUBREG_TO_REG]]
+ ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY]].hi16, 0, 0, 0, implicit $mode, implicit $exec
%0:vgpr_16 = IMPLICIT_DEF
- %1:sreg_32 = IMPLICIT_DEF
- %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
- %3:sreg_32 = COPY %2:vgpr_16
- nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
- %4:sreg_32_xm0_xexec = COPY $scc
- %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+ %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+ %2:sreg_32 = COPY %1:vgpr_16
+ %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
...
---
-name: cvt_hi_f32_f16
+name: s_or_b32
body: |
bb.0:
- ; GCN-LABEL: name: cvt_hi_f32_f16
+ ; GCN-LABEL: name: s_or_b32
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY1]].hi16, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
+ ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[SUBREG_TO_REG]], [[SUBREG_TO_REG]], implicit $exec
+ ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[V_OR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
%0:vgpr_16 = IMPLICIT_DEF
%1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
%2:sreg_32 = COPY %1:vgpr_16
- %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
+ %3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
+ %4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 644c88457714b..8c5bc4a33a303 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -255,15 +255,15 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 622a335015eba..297e4f0927204 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -1093,13 +1093,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 999282bf60539..ffbb9fde26e55 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -906,13 +906,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 27ec1cfadd9d2..de12f2b246f57 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -259,13 +259,13 @@ define amdgpu_kernel void @rint_v2f16(
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index e16540fec0229..1a426096da197 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -255,15 +255,15 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index ae41f4381251d..0f709b044f63a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -238,13 +238,13 @@ define amdgpu_kernel void @trunc_v2f16(
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index f6e9f152dca5e..51dfbda53ad4c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -736,43 +736,37 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-TRUE16-LABEL: constant_load_v16i16_align2:
; GFX12-TRUE16: ; %bb.0: ; %entry
; GFX12-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v9, 0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_clause 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v9, s[0:1] offset:16
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:12
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v9, s[0:1] offset:8
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:4
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v9, s[0:1] offset:28
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:24
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v9, s[0:1] offset:20
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v8, v9, s[0:1]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x6
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x5
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v3, v8, s[0:1] offset:28
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:24
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:20
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1] offset:16
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:12
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:8
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:4
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v8, s[0:1]
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x4
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x3
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.h
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.l
-; GFX12-TRUE16-NEXT: s_clause 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v9, s[0:1] offset:30
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v9, s[0:1] offset:26
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v9, s[0:1] offset:22
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:18
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v9, s[0:1] offset:14
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v9, s[0:1] offset:10
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:6
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:2
+; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: s_clause 0x1
; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[4:7], off
-; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX12-TRUE16-NEXT: s_endpgm
;
; GFX12-FAKE16-LABEL: constant_load_v16i16_align2:
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 9dd7b946ff5bd..7339b545686f5 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -880,17 +880,17 @@ define amdgpu_kernel void @select_v2f16(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1066,17 +1066,17 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v4.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1245,17 +1245,17 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v4.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1428,15 +1428,15 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e64 s0, v4.l, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3900, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1609,15 +1609,15 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v4.l, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3900, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
More information about the llvm-commits
mailing list