[llvm] [AMDGPU] Generate more efficient code to avoid shift64 bug (PR #171871)

via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 12 06:54:28 PST 2025


https://github.com/LU-JOHN updated https://github.com/llvm/llvm-project/pull/171871

>From 1d6642b6e88a60eb5688b9c81faeb82f9558beb1 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 11 Dec 2025 11:25:21 -0600
Subject: [PATCH 1/4] Generate more efficient code to avoid shift64 bug

Signed-off-by: John Lu <John.Lu at amd.com>
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 38 +++++++++++++------
 llvm/test/CodeGen/AMDGPU/hazard-shift64.mir   | 35 +++++------------
 2 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 6f1a5210fb7e0..8c27bc79bf1e0 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2217,16 +2217,35 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
     return false;
 
-  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
-  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
-  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
-  bool Overlapped = OverlappedSrc || OverlappedDst;
-
-  assert(!OverlappedDst || !OverlappedSrc ||
-         Src1->getReg() == MI->getOperand(0).getReg());
   assert(ST.needsAlignedVGPRs());
   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
 
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
+
+  /* In:
+
+        Dst = shiftrev64 Amt, Src1
+
+     if  Dst!=Src1 then avoid the bug with:
+
+        Dst.sub0 = Amt
+        Dst = shift64 Dst.sub0, Src1
+
+  */
+  Register DstReg = MI->getOperand(0).getReg();
+  if (!Src1->isReg() || Src1->getReg() != DstReg) {
+    Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
+    runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32))
+                         .addReg(DstLo, RegState::Define)
+                         .addReg(AmtReg, Amt->isKill() ? RegState::Kill : 0));
+    Amt->setReg(DstLo);
+    Amt->setIsKill(true);
+    return true;
+  }
+
+  bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
   Register NewReg;
   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                    : AMDGPU::VGPR_32RegClass) {
@@ -2243,8 +2262,6 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
   if (Overlapped)
     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
 
-  DebugLoc DL = MI->getDebugLoc();
-  MachineBasicBlock *MBB = MI->getParent();
   // Insert a full wait count because found register might be pending a wait.
   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
       .addImm(0);
@@ -2282,9 +2299,8 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
   Amt->setIsKill(false);
   // We do not update liveness, so verifier may see it as undef.
   Amt->setIsUndef();
-  if (OverlappedDst)
+  if (Overlapped) {
     MI->getOperand(0).setReg(NewReg);
-  if (OverlappedSrc) {
     Src1->setReg(NewReg);
     Src1->setIsKill(false);
     Src1->setIsUndef();
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir
index 616fef1117eb2..74c641bebb78b 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir
@@ -64,10 +64,8 @@ body:             |
     ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_dst
     ; GCN: $vgpr7 = IMPLICIT_DEF
     ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
-    ; GCN-NEXT: S_WAITCNT 0
-    ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr4, implicit $exec
-    ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 undef $vgpr4, killed $vgpr2_vgpr3, implicit $exec
-    ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+    ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+    ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr0, killed $vgpr2_vgpr3, implicit $exec
     $vgpr7 = IMPLICIT_DEF
     $vgpr2_vgpr3 = IMPLICIT_DEF
     renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec
@@ -82,10 +80,8 @@ body:             |
     ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_src
     ; GCN: $vgpr7 = IMPLICIT_DEF
     ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; GCN-NEXT: S_WAITCNT 0
-    ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr4, implicit $exec
-    ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 undef $vgpr4, killed $vgpr0_vgpr1, implicit $exec
-    ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+    ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr2, killed $vgpr0_vgpr1, implicit $exec
     $vgpr7 = IMPLICIT_DEF
     $vgpr0_vgpr1 = IMPLICIT_DEF
     renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
@@ -118,12 +114,8 @@ body:             |
     ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_src
     ; GCN: $vgpr7 = IMPLICIT_DEF
     ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
-    ; GCN-NEXT: S_WAITCNT 0
-    ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 undef $vgpr6, undef $vgpr2, implicit $exec
-    ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr3, implicit $exec
-    ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 undef $vgpr3, undef $vgpr2_vgpr3, implicit $exec
-    ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec
-    ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec
+    ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+    ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr0, killed $vgpr6_vgpr7, implicit $exec
     $vgpr7 = IMPLICIT_DEF
     $vgpr6_vgpr7 = IMPLICIT_DEF
     renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
@@ -138,12 +130,8 @@ body:             |
     ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_dst
     ; GCN: $vgpr7 = IMPLICIT_DEF
     ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; GCN-NEXT: S_WAITCNT 0
-    ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 undef $vgpr6, undef $vgpr2, implicit $exec
-    ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr3, implicit $exec
-    ; GCN-NEXT: $vgpr2_vgpr3 = V_LSHRREV_B64_e64 undef $vgpr3, killed $vgpr0_vgpr1, implicit $exec
-    ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec
-    ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec
+    ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+    ; GCN-NEXT: renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr6, killed $vgpr0_vgpr1, implicit $exec
     $vgpr7 = IMPLICIT_DEF
     $vgpr0_vgpr1 = IMPLICIT_DEF
     renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
@@ -179,11 +167,8 @@ body:             |
     ; GCN: $vgpr7 = IMPLICIT_DEF
     ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
     ; GCN-NEXT: $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_WAITCNT 0
-    ; GCN-NEXT: S_NOP 4
-    ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr4, implicit $exec
-    ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 undef $vgpr4, killed $vgpr0_vgpr1, implicit $exec
-    ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr7, implicit $exec
+    ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr2, killed $vgpr0_vgpr1, implicit $exec
     $vgpr7 = IMPLICIT_DEF
     $vgpr0_vgpr1 = IMPLICIT_DEF
     $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec

>From 1cefc4eade2f1ca5d519aee83d0f2fa5017dd004 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 11 Dec 2025 17:31:20 -0600
Subject: [PATCH 2/4] Address feedback

Signed-off-by: John Lu <John.Lu at amd.com>
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8c27bc79bf1e0..5c45f1b2a85c4 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2220,7 +2220,7 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
   assert(ST.needsAlignedVGPRs());
   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
 
-  DebugLoc DL = MI->getDebugLoc();
+  const DebugLoc DL = MI->getDebugLoc();
   MachineBasicBlock *MBB = MI->getParent();
   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
 
@@ -2237,9 +2237,9 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
   Register DstReg = MI->getOperand(0).getReg();
   if (!Src1->isReg() || Src1->getReg() != DstReg) {
     Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
-    runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32))
-                         .addReg(DstLo, RegState::Define)
-                         .addReg(AmtReg, Amt->isKill() ? RegState::Kill : 0));
+    runOnInstruction(
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo)
+            .addReg(AmtReg, Amt->isKill() ? RegState::Kill : 0));
     Amt->setReg(DstLo);
     Amt->setIsKill(true);
     return true;

>From 371a27bc7186a71cb0e6d019af3d903d9302f7ec Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 11 Dec 2025 18:17:58 -0600
Subject: [PATCH 3/4] Use C++ style comments

Signed-off-by: John Lu <John.Lu at amd.com>
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 5c45f1b2a85c4..64cceb45abbae 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2224,16 +2224,15 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
   MachineBasicBlock *MBB = MI->getParent();
   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
 
-  /* In:
-
-        Dst = shiftrev64 Amt, Src1
-
-     if  Dst!=Src1 then avoid the bug with:
-
-        Dst.sub0 = Amt
-        Dst = shift64 Dst.sub0, Src1
+  // In:
+  //
+  //    Dst = shiftrev64 Amt, Src1
+  //
+  // if  Dst!=Src1 then avoid the bug with:
+  //
+  //    Dst.sub0 = Amt
+  //    Dst = shift64 Dst.sub0, Src1
 
-  */
   Register DstReg = MI->getOperand(0).getReg();
   if (!Src1->isReg() || Src1->getReg() != DstReg) {
     Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);

>From 1d11c380cba3d6e47f28a44c1496b7badd087dc6 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Fri, 12 Dec 2025 08:54:07 -0600
Subject: [PATCH 4/4] Simplify code

Signed-off-by: John Lu <John.Lu at amd.com>
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 64cceb45abbae..52abcac6cea5a 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2237,8 +2237,7 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
   if (!Src1->isReg() || Src1->getReg() != DstReg) {
     Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
     runOnInstruction(
-        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo)
-            .addReg(AmtReg, Amt->isKill() ? RegState::Kill : 0));
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
     Amt->setReg(DstLo);
     Amt->setIsKill(true);
     return true;



More information about the llvm-commits mailing list