[llvm] 95d497f - [AMDGPU] W/a hazard if 64 bit shift amount is a highest allocated VGPR
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 7 14:24:05 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-09-07T14:23:49-07:00
New Revision: 95d497ff2a765ec1038f5670a07369e1c4f639c4
URL: https://github.com/llvm/llvm-project/commit/95d497ff2a765ec1038f5670a07369e1c4f639c4
DIFF: https://github.com/llvm/llvm-project/commit/95d497ff2a765ec1038f5670a07369e1c4f639c4.diff
LOG: [AMDGPU] W/a hazard if 64 bit shift amount is a highest allocated VGPR
In this case gfx90a uses v0 instead of the correct register. Swap
the value temporarily with a lower register and then swap it back.
Unfortunately, the hazard recognizer runs after wait count insertion,
so we cannot simply reuse an arbitrary register; hence the workaround
also includes a full waitcount. This can be avoided if we run it from
expandPostRAPseudo, but that is a complete misplacement.
Differential Revision: https://reviews.llvm.org/D133067
Added:
llvm/test/CodeGen/AMDGPU/hazard-shift64.mir
Modified:
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
llvm/lib/Target/AMDGPU/GCNSubtarget.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 9416328bd0ce1..e454d7f34dce8 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -300,6 +300,20 @@ void GCNHazardRecognizer::processBundle() {
CurrCycleInstr = nullptr;
}
+void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
+ assert(IsHazardRecognizerMode);
+
+ unsigned NumPreNoops = PreEmitNoops(MI);
+ EmitNoops(NumPreNoops);
+ if (MI->isInsideBundle())
+ insertNoopsInBundle(MI, TII, NumPreNoops);
+ else
+ TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
+ NumPreNoops);
+ EmitInstruction(MI);
+ AdvanceCycle();
+}
+
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
IsHazardRecognizerMode = true;
CurrCycleInstr = MI;
@@ -1087,6 +1101,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);
fixWMMAHazards(MI);
+ fixShift64HighRegBug(MI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -1739,6 +1754,105 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
return true;
}
+bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
+ if (!ST.hasShift64HighRegBug())
+ return false;
+
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+ case AMDGPU::V_LSHLREV_B64_e64:
+ case AMDGPU::V_LSHRREV_B64_e64:
+ case AMDGPU::V_ASHRREV_I64_e64:
+ break;
+ }
+
+ MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
+ if (!Amt->isReg())
+ return false;
+
+ Register AmtReg = Amt->getReg();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Check if this is a last VGPR in the allocation block.
+ if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
+ return false;
+
+ if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
+ return false;
+
+ MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
+ bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
+ bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
+ bool Overlapped = OverlappedSrc || OverlappedDst;
+
+ assert(!OverlappedDst || !OverlappedSrc ||
+ Src1->getReg() == MI->getOperand(0).getReg());
+ assert(ST.needsAlignedVGPRs());
+ static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
+
+ Register NewReg;
+ for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
+ : AMDGPU::VGPR_32RegClass) {
+ if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
+ NewReg = Reg;
+ break;
+ }
+ }
+
+ Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
+ : NewReg;
+ Register NewAmtLo;
+
+ if (Overlapped)
+ NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
+
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock *MBB = MI->getParent();
+ // Insert a full wait count because found register might be pending a wait.
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
+ if (Overlapped)
+ runOnInstruction(
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
+ .addDef(AmtReg - 1)
+ .addReg(AmtReg - 1)
+ .addReg(NewAmtLo));
+ runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
+ .addDef(AmtReg)
+ .addReg(AmtReg)
+ .addReg(NewAmt));
+
+ // Instructions emitted after the current instruction will be processed by the
+ // parent loop of the hazard recognizer in a natural way.
+ BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
+ AmtReg)
+ .addDef(NewAmt)
+ .addReg(NewAmt)
+ .addReg(AmtReg);
+ if (Overlapped)
+ BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
+ AmtReg - 1)
+ .addDef(NewAmtLo)
+ .addReg(NewAmtLo)
+ .addReg(AmtReg - 1);
+
+ // Re-running hazard recognizer on the modified instruction is not necessary,
+ // inserted V_SWAP_B32 has already both read and write new registers so
+ // hazards related to these register has already been handled.
+ Amt->setReg(NewAmt);
+ Amt->setIsKill(false);
+ if (OverlappedDst)
+ MI->getOperand(0).setReg(NewReg);
+ if (OverlappedSrc) {
+ Src1->setReg(NewReg);
+ Src1->setIsKill(false);
+ }
+
+ return true;
+}
+
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
int NSAtoVMEMWaitStates = 1;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 57f5a04c6eda9..ac19b04fdbc74 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -70,6 +70,10 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
// instructions.
void processBundle();
+ // Run on an individual instruction in hazard recognizer mode. This can be
+ // used on a newly inserted instruction before returning from PreEmitNoops.
+ void runOnInstruction(MachineInstr *MI);
+
int getWaitStatesSince(IsHazardFn IsHazard, int Limit);
int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit);
int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit);
@@ -101,6 +105,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
bool fixVALUPartialForwardingHazard(MachineInstr *MI);
bool fixVALUTransUseHazard(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
+ bool fixShift64HighRegBug(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
int checkMAIHazards908(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b4a0dd7986515..d161f4aa13ec5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1008,6 +1008,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasLdsBranchVmemWARHazard;
}
+ // Shift amount of a 64 bit shift cannot be a highest allocated register
+ // if also at the end of the allocation block.
+ bool hasShift64HighRegBug() const {
+ return GFX90AInsts && !GFX940Insts;
+ }
+
// Has one cycle hazard on transcendental instruction feeding a
// non transcendental VALU.
bool hasTransForwardingHazard() const { return GFX940Insts; }
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir
new file mode 100644
index 0000000000000..c4cd527823b89
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir
@@ -0,0 +1,250 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: highest_reg_shift_amt_v7
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_v7
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr0, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_v15
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_v15
+ ; GCN: $vgpr15 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr15 = V_SWAP_B32 $vgpr15, $vgpr0, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr15, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr15, implicit $exec
+ $vgpr15 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr15, killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_v255
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_v255
+ ; GCN: $vgpr255 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr255 = V_SWAP_B32 $vgpr255, $vgpr0, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr255, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr255, implicit $exec
+ $vgpr255 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr255, killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_used_v0_dst
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_dst
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr4, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr4, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_used_v0_src
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_src
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr4, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr4, killed $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_used_v0_both
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_both
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr2, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr2, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr2, killed $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_overlapped_src
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_src
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr2, implicit $exec
+ ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr3, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr3, $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr6_vgpr7 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_overlapped_dst
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_dst
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr2, implicit $exec
+ ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr3, killed $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_overlapped_both
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_both
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr0, implicit $exec
+ ; GCN-NEXT: $vgpr1, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr1, $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr6, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr6_vgpr7 = IMPLICIT_DEF
+ renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_hazard_in_swap
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_hazard_in_swap
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: S_NOP 4
+ ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr4, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr4, killed $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_hazard_in_swap2
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_hazard_in_swap2
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr0, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr1, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr1, $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr6, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr6_vgpr7 = IMPLICIT_DEF
+ $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec
+ renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_v7_bundle
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_v7_bundle
+ ; GCN: BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr7 {
+ ; GCN-NEXT: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr0, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr7, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr7, implicit $exec
+ BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr7 {
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec
+ }
+...
+
+---
+name: highest_reg_shift_amt_hazard_in_swap2_bundle
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_hazard_in_swap2_bundle
+ ; GCN: BUNDLE implicit-def $vgpr1, implicit-def $vgpr6_vgpr7 {
+ ; GCN-NEXT: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr0, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr1, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr1, $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr6, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr7, implicit $exec
+ BUNDLE implicit-def $vgpr1, implicit-def $vgpr6_vgpr7 {
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr6_vgpr7 = IMPLICIT_DEF
+ $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec
+ renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
+ }
+...
More information about the llvm-commits
mailing list