[llvm] [AMDGPU] Generate more efficient code to avoid shift64 bug (PR #171871)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 09:27:50 PST 2025
https://github.com/LU-JOHN created https://github.com/llvm/llvm-project/pull/171871
Generate more efficient code to avoid the shift64 bug when dst != src1.
Transform:
dst = shiftrev64 amt, src1
to:
dst.sub0 = amt
dst = shiftrev64 dst.sub0, src1
From 1d6642b6e88a60eb5688b9c81faeb82f9558beb1 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 11 Dec 2025 11:25:21 -0600
Subject: [PATCH] Generate more efficient code to avoid shift64 bug
Signed-off-by: John Lu <John.Lu at amd.com>
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 38 +++++++++++++------
llvm/test/CodeGen/AMDGPU/hazard-shift64.mir | 35 +++++------------
2 files changed, 37 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 6f1a5210fb7e0..8c27bc79bf1e0 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2217,16 +2217,35 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
return false;
- MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
- bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
- bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
- bool Overlapped = OverlappedSrc || OverlappedDst;
-
- assert(!OverlappedDst || !OverlappedSrc ||
- Src1->getReg() == MI->getOperand(0).getReg());
assert(ST.needsAlignedVGPRs());
static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
+
+ /* In:
+
+ Dst = shiftrev64 Amt, Src1
+
+ if Dst != Src1 then avoid the bug with:
+
+ Dst.sub0 = Amt
+ Dst = shiftrev64 Dst.sub0, Src1
+
+ */
+ Register DstReg = MI->getOperand(0).getReg();
+ if (!Src1->isReg() || Src1->getReg() != DstReg) {
+ Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
+ runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32))
+ .addReg(DstLo, RegState::Define)
+ .addReg(AmtReg, Amt->isKill() ? RegState::Kill : 0));
+ Amt->setReg(DstLo);
+ Amt->setIsKill(true);
+ return true;
+ }
+
+ bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
Register NewReg;
for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
: AMDGPU::VGPR_32RegClass) {
@@ -2243,8 +2262,6 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (Overlapped)
NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
- DebugLoc DL = MI->getDebugLoc();
- MachineBasicBlock *MBB = MI->getParent();
// Insert a full wait count because found register might be pending a wait.
BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
.addImm(0);
@@ -2282,9 +2299,8 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
Amt->setIsKill(false);
// We do not update liveness, so verifier may see it as undef.
Amt->setIsUndef();
- if (OverlappedDst)
+ if (Overlapped) {
MI->getOperand(0).setReg(NewReg);
- if (OverlappedSrc) {
Src1->setReg(NewReg);
Src1->setIsKill(false);
Src1->setIsUndef();
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir
index 616fef1117eb2..74c641bebb78b 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir
@@ -64,10 +64,8 @@ body: |
; GCN-LABEL: name: highest_reg_shift_amt_used_v0_dst
; GCN: $vgpr7 = IMPLICIT_DEF
; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
- ; GCN-NEXT: S_WAITCNT 0
- ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr4, implicit $exec
- ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 undef $vgpr4, killed $vgpr2_vgpr3, implicit $exec
- ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr0, killed $vgpr2_vgpr3, implicit $exec
$vgpr7 = IMPLICIT_DEF
$vgpr2_vgpr3 = IMPLICIT_DEF
renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec
@@ -82,10 +80,8 @@ body: |
; GCN-LABEL: name: highest_reg_shift_amt_used_v0_src
; GCN: $vgpr7 = IMPLICIT_DEF
; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
- ; GCN-NEXT: S_WAITCNT 0
- ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr4, implicit $exec
- ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 undef $vgpr4, killed $vgpr0_vgpr1, implicit $exec
- ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr2, killed $vgpr0_vgpr1, implicit $exec
$vgpr7 = IMPLICIT_DEF
$vgpr0_vgpr1 = IMPLICIT_DEF
renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
@@ -118,12 +114,8 @@ body: |
; GCN-LABEL: name: highest_reg_shift_amt_overlapped_src
; GCN: $vgpr7 = IMPLICIT_DEF
; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
- ; GCN-NEXT: S_WAITCNT 0
- ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 undef $vgpr6, undef $vgpr2, implicit $exec
- ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr3, implicit $exec
- ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 undef $vgpr3, undef $vgpr2_vgpr3, implicit $exec
- ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec
- ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr0, killed $vgpr6_vgpr7, implicit $exec
$vgpr7 = IMPLICIT_DEF
$vgpr6_vgpr7 = IMPLICIT_DEF
renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
@@ -138,12 +130,8 @@ body: |
; GCN-LABEL: name: highest_reg_shift_amt_overlapped_dst
; GCN: $vgpr7 = IMPLICIT_DEF
; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
- ; GCN-NEXT: S_WAITCNT 0
- ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 undef $vgpr6, undef $vgpr2, implicit $exec
- ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr3, implicit $exec
- ; GCN-NEXT: $vgpr2_vgpr3 = V_LSHRREV_B64_e64 undef $vgpr3, killed $vgpr0_vgpr1, implicit $exec
- ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec
- ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+ ; GCN-NEXT: renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr6, killed $vgpr0_vgpr1, implicit $exec
$vgpr7 = IMPLICIT_DEF
$vgpr0_vgpr1 = IMPLICIT_DEF
renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
@@ -179,11 +167,8 @@ body: |
; GCN: $vgpr7 = IMPLICIT_DEF
; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
; GCN-NEXT: $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_WAITCNT 0
- ; GCN-NEXT: S_NOP 4
- ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr4, implicit $exec
- ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 undef $vgpr4, killed $vgpr0_vgpr1, implicit $exec
- ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr2, killed $vgpr0_vgpr1, implicit $exec
$vgpr7 = IMPLICIT_DEF
$vgpr0_vgpr1 = IMPLICIT_DEF
$vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
More information about the llvm-commits
mailing list