[llvm] a6e353b - AMDGPU: Split large offsets when selecting global saddr mode
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 16 08:36:09 PST 2020
Author: Matt Arsenault
Date: 2020-11-16T11:36:01-05:00
New Revision: a6e353b1d0831867225825d8580bad48174577ae
URL: https://github.com/llvm/llvm-project/commit/a6e353b1d0831867225825d8580bad48174577ae
DIFF: https://github.com/llvm/llvm-project/commit/a6e353b1d0831867225825d8580bad48174577ae.diff
LOG: AMDGPU: Split large offsets when selecting global saddr mode
When the offset doesn't fit in the immediate field, move some to
voffset.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
llvm/test/CodeGen/AMDGPU/global_atomics.ll
llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
llvm/test/CodeGen/AMDGPU/offset-split-global.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 33c5430c7cda..339d6fb42f96 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1828,10 +1828,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- // TODO: Could split larger constant into VGPR offset.
if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) {
Addr = LHS;
ImmOffset = COffsetVal;
+ } else if (!LHS->isDivergent() && COffsetVal > 0) {
+ SDLoc SL(N);
+ // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) +
+ // (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset)
+ = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true);
+
+ if (isUInt<32>(RemainderOffset)) {
+ SDNode *VMov = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+ CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
+ SAddr = LHS;
+ Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+ return true;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e5cbcb3ccdb7..feebe259fe9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2294,6 +2294,10 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}
+bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
+ return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
+}
+
bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
if (!MI.hasOneMemOperand())
return false;
@@ -3480,29 +3484,64 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
+ Register Addr = Root.getReg();
Register PtrBase;
- int64_t ImmOffset;
+ int64_t ConstOffset;
+ int64_t ImmOffset = 0;
// Match the immediate offset first, which canonically is moved as low as
// possible.
- std::tie(PtrBase, ImmOffset) = getPtrBaseWithConstantOffset(Root.getReg(),
- *MRI);
-
- // TODO: Could split larger constant into VGPR offset.
- if (ImmOffset != 0 &&
- !TII.isLegalFLATOffset(ImmOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
- PtrBase = Root.getReg();
- ImmOffset = 0;
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+
+ if (ConstOffset != 0) {
+ if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+ Addr = PtrBase;
+ ImmOffset = ConstOffset;
+ } else if (ConstOffset > 0) {
+ auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
+ if (!PtrBaseDef)
+ return None;
+
+ if (isSGPR(PtrBaseDef->Reg)) {
+ // Offset is too large.
+ //
+ // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
+ // + (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset)
+ = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
+
+ if (isUInt<32>(RemainderOffset)) {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register HighBits
+ = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+ HighBits)
+ .addImm(RemainderOffset);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ }};
+ }
+ }
+ }
}
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+ if (!AddrDef)
+ return None;
+
// Match the variable offset.
- const MachineInstr *PtrBaseDef = getDefIgnoringCopies(PtrBase, *MRI);
- if (PtrBaseDef->getOpcode() != AMDGPU::G_PTR_ADD)
+ if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
return None;
// Look through the SGPR->VGPR copy.
Register PtrBaseSrc =
- getSrcRegIgnoringCopies(PtrBaseDef->getOperand(1).getReg(), *MRI);
+ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
if (!PtrBaseSrc)
return None;
@@ -3511,7 +3550,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
return None;
Register SAddr = PtrBaseSrc;
- Register PtrBaseOffset = PtrBaseDef->getOperand(2).getReg();
+ Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
// It's possible voffset is an SGPR here, but the copy to VGPR will be
// inserted later.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 9b9a5318538c..c575e7e9c8a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -72,6 +72,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
};
+ bool isSGPR(Register Reg) const;
+
bool isInstrUniform(const MachineInstr &MI) const;
bool isVCC(Register Reg, const MachineRegisterInfo &MRI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
index f7940e60e8c1..18ec87e178e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
@@ -800,20 +800,10 @@ body: |
; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
- ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX10: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX10: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
- ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY3]], [[COPY4]], implicit-def $scc
- ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY5]], [[COPY6]], implicit-def $scc, implicit $scc
- ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX10: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
- ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
+ ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
+ ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
%0:sgpr(p1) = COPY $sgpr0_sgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s32) = COPY $vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
index 4f289d555913..23ba321f9fc8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
@@ -2,9 +2,7 @@
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
-# TODO: Better to initialize 0 vgpr and use sgpr base
---
-
name: load_global_s32_from_sgpr
legalized: true
regBankSelected: true
@@ -273,3 +271,449 @@ body: |
$vgpr0 = COPY %7
...
+---
+name: load_global_s32_from_sgpr_base_offset_4096
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4096
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4096
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 4096
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
+
+---
+name: load_global_s32_from_sgpr_base_offset_4097
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4097
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4097
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+ ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 4097
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
+
+---
+name: load_global_s32_from_sgpr_base_offset_neg4097
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199
+ ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199
+ ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 -4097
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
+
+---
+name: load_global_s32_from_sgpr_base_offset_2049
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_2049
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2049
+ ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_2049
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
+ ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 2049
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
+
+---
+name: load_global_s32_from_sgpr_base_offset_neg2049
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294965247
+ ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294965247
+ ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 -2049
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
+---
+name: load_global_s32_from_sgpr_base_offset_4294967295
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967295
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
+ ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4095, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967295
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
+ ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2047, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 4294967295
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
+---
+name: load_global_s32_from_sgpr_base_offset_4294967296
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967296
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967296
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 4294967296
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
+
+---
+name: load_global_s32_from_sgpr_base_offset_4294971390
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
+ ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
+ ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 4294971390
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
+
+---
+name: load_global_s32_from_sgpr_base_offset_neg4294967295
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967295
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967295
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 -4294967295
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
+---
+name: load_global_s32_from_sgpr_base_offset_neg4294967296
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967296
+ ; GFX9: liveins: $sgpr0_sgpr1
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967296
+ ; GFX10: liveins: $sgpr0_sgpr1
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
+ ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ %0:sgpr(p1) = COPY $sgpr0_sgpr1
+ %1:sgpr(s64) = G_CONSTANT i64 -4294967296
+ %2:sgpr(p1) = G_PTR_ADD %0, %1
+ %3:vgpr(p1) = COPY %2
+ %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %4
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 8eee8a80eb82..b46ac1bd8fd5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -72,16 +72,13 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(i32 addrspace(1)*
; GCN-LABEL: global_atomic_csub_sgpr_base_offset:
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v1, 0x1000
; GCN-NEXT: ; implicit-def: $vcc_hi
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_add_u32 s0, s0, 0x1000
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_endpgm
@@ -95,15 +92,12 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(i32 addrspa
; GCN-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v1, 0x1000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_add_u32 s0, s0, 0x1000
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
; GCN-NEXT: s_endpgm
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data)
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
index 7d3839d213b8..1d79f88c0094 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -40,11 +40,9 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 add
; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off offset:28 glc
; GCN-NEXT: BB0_2: ; %endif
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_co_u32_e64 v1, s0, 0x3d0800, s0
-; GCN-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, s1, s0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dword v[1:2], v0, off offset:252
+; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0800
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:252
; GCN-NEXT: s_endpgm
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index e14a35e15082..22dde27d0038 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -40,13 +40,9 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: BB0_2: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_add_co_u32_e32 v1, vcc, 0x3d0000, v1
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dword v[1:2], v0, off offset:2300
+; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300
; GCN-NEXT: s_endpgm
entry:
%out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index bb621b294bbc..c803b26a969a 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -45,9 +45,8 @@ done:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
-; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xf000,
-; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0,
-; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
+; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0xf000{{$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; GCN: {{^}}BB1_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@@ -103,7 +102,8 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
-; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0x1000{{$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]$}}
; GCN: {{^}}BB3_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 60cc73c03366..b9760a6cba1a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4,6 +4,463 @@
; Test using saddr addressing mode of global_*load_* flat instructions.
+; --------------------------------------------------------------------------------
+; No vgpr offset, constants
+; --------------------------------------------------------------------------------
+
+; SGPR base with maximum gfx9 immediate offset
+define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_4095:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_4095:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum gfx9 immediate offset + 1
+define amdgpu_ps float @global_load_saddr_i8_offset_4096(i8 addrspace(1)* inreg %sbase) {
+; GCN-LABEL: global_load_saddr_i8_offset_4096:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 0x1000
+; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4096
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum gfx9 immediate offset + 2
+define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg %sbase) {
+; GCN-LABEL: global_load_saddr_i8_offset_4097:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 0x1000
+; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4097
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum negative gfx9 immediate offset
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg4096:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_neg4096:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff000, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum negative gfx9 immediate offset - 1
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg4097:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_neg4097:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff000, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum negative gfx9 immediate offset - 2
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg4098:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_neg4098:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff000, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4098
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum gfx10 immediate offset
+define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_2048:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_2048:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2048
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum gfx10 immediate offset + 1
+define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_2049:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2049
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_2049:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2049
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum gfx10 immediate offset + 2
+define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_2050:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2050
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_2050:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2050
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum negative gfx10 immediate offset
+define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) {
+; GCN-LABEL: global_load_saddr_i8_offset_neg2048:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum negative gfx10 immediate offset - 1
+define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg2049:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_neg2049:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff800, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum negative gfx10 immediate offset - 2
+define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg2050:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2050
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_neg2050:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff800, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_4294967295:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xfffff000
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_4294967295:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xfffff800
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967295
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_4294967296:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_4294967296:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_4294967297:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_4294967297:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_4294971391:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_4294971391:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0x800, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_4294971392:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_4294971392:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0x1000, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0x800, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1)* inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967297
+ %load = load i8, i8 addrspace(1)* %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
; --------------------------------------------------------------------------------
; Basic addressing patterns
; --------------------------------------------------------------------------------
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index dc014ac770b7..7bd2fd270747 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -25,9 +25,8 @@ entry:
; SIVI: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0
; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x8000,
-; GFX9-NEXT: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
-; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:3232{{$}}
+; GFX9: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8000{{$}}
+; GFX9: global_atomic_add [[OFFSET]], v{{[0-9]+}}, s{{\[[0-9]:[0-9]+\]}} offset:3232{{$}}
define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index bfbcfc8bc7fe..88bec6cf66df 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -991,9 +991,8 @@ entry:
; CIVI: s_mov_b32 [[SREG:s[0-9]+]], 0x11940
; CIVI: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x11000,
-; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
-; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:2368{{$}}
+; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0x11000{{$}}
+; GFX9: global_atomic_cmpswap_x2 [[VOFFSET]], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:2368{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 731a95b2a3e9..3b0795d25a14 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -701,11 +701,10 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p
; GFX10-LABEL: global_inst_salu_offset_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
@@ -719,12 +718,9 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p
; GFX9-LABEL: global_inst_salu_offset_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
@@ -732,11 +728,10 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p
; GFX10-LABEL: global_inst_salu_offset_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
@@ -850,11 +845,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)*
; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
@@ -868,12 +862,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)*
; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
@@ -881,11 +872,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)*
; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
@@ -899,12 +889,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)*
; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
@@ -912,11 +899,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)*
; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x3800, s0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
More information about the llvm-commits
mailing list