[llvm] [AMDGPU] Select 64-bit moves (PR #70395)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 26 17:12:28 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
This allows folding of 64-bit operands if they fit into 32 bits. Fixes https://github.com/llvm/llvm-project/issues/67781
---
Patch is 2.21 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70395.diff
87 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+9-4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+8)
- (modified) llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp (+1-1)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+12-3)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll (+3-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+59-65)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+420-448)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+421-445)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll (+34-18)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir (+61-56)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir (+58-78)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir (+162-168)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir (+166-154)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir (+132-228)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir (+14-28)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir (+36-57)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir (+4-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir (+60-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir (+92-64)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir (+24-30)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir (+394-378)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir (+30-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir (+299-409)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir (+12-24)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir (+99-21)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir (+10-14)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll (+11-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll (+8-22)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (+154-151)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+143-137)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+97-94)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+89-86)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+82-94)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/commute-compares.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/constrained-shift.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll (+326-498)
- (modified) llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (+43-72)
- (modified) llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir (+5-1)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll (+141-143)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+147-177)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+147-177)
- (modified) llvm/test/CodeGen/AMDGPU/inline-asm.ll (+2-3)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+5-4)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+17-20)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+19-23)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+23-29)
- (modified) llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll (+148-174)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll (+60-176)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll (+210-462)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll (+40-80)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll (+150-300)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll (+1-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.frexp.ll (+47-25)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+39-38)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+426-423)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-flat.ll (+264-597)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-global.ll (+110-292)
- (modified) llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll (+32-36)
- (modified) llvm/test/CodeGen/AMDGPU/rsq.f64.ll (+327-322)
- (modified) llvm/test/CodeGen/AMDGPU/salu-to-valu.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/shl.ll (+27-32)
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+3676-3683)
- (modified) llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll (+163-163)
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/swdev380865.ll (+28-80)
- (modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+80-78)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll (+3-3)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..804ffb90b530241 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -595,11 +595,15 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
uint64_t Imm;
- if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
+ if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
- else {
+ if (AMDGPU::isValid32BitLiteral(Imm, true))
+ break;
+ } else {
ConstantSDNode *C = cast<ConstantSDNode>(N);
Imm = C->getZExtValue();
+ if (AMDGPU::isValid32BitLiteral(Imm, false))
+ break;
}
SDLoc DL(N);
@@ -3014,7 +3018,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
if (!RC || SIRI->isSGPRClass(RC))
return false;
- if (RC != &AMDGPU::VS_32RegClass) {
+ if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
AllUsesAcceptSReg = false;
SDNode * User = *U;
if (User->isMachineOpcode()) {
@@ -3026,7 +3030,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
- if (CommutedRC == &AMDGPU::VS_32RegClass)
+ if (CommutedRC == &AMDGPU::VS_32RegClass ||
+ CommutedRC == &AMDGPU::VS_64RegClass)
AllUsesAcceptSReg = true;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..b68096decbb7db9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2551,11 +2551,13 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineOperand &ImmOp = I.getOperand(1);
Register DstReg = I.getOperand(0).getReg();
unsigned Size = MRI->getType(DstReg).getSizeInBits();
+ bool IsFP = false;
// The AMDGPU backend only supports Imm operands and not CImm or FPImm.
if (ImmOp.isFPImm()) {
const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
ImmOp.ChangeToImmediate(Imm.getZExtValue());
+ IsFP = true;
} else if (ImmOp.isCImm()) {
ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
} else {
@@ -2568,6 +2570,12 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
unsigned Opcode;
if (DstRB->getID() == AMDGPU::VCCRegBankID) {
Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ } else if (Size == 64 &&
+ AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
+ Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
+ I.setDesc(TII.get(Opcode));
+ I.addImplicitDefUseOperands(*MF);
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
} else {
Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index b32ed9fef5dd34e..b7ac90e33f65e00 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -367,7 +367,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
SMovOp = AMDGPU::S_MOV_B32;
break;
case AMDGPU::V_MOV_B64_PSEUDO:
- SMovOp = AMDGPU::S_MOV_B64;
+ SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO;
break;
}
Imm = ImmOp->getImm();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 827c2c156638468..284943eae46500d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5497,9 +5497,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
- if (Is64BitOp && !AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
- !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm()))
- return false;
+ if (Is64BitOp &&
+ !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
+ if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
+ return false;
+
+ // FIXME: We can use sign extended 64-bit literals, but only for signed
+ // operands. At the moment we do not know if an operand is signed.
+ // Such operand will be encoded as its low 32 bits and then either
+ // correctly sign extended or incorrectly zero extended by HW.
+ if (!Is64BitFPOp && (int32_t)Lo_32(Imm) < 0)
+ return false;
+ }
}
// Handle non-register types that are treated like immediates.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 567f1b812c1808c..f93b827ec17ab1f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1934,6 +1934,26 @@ def : GCNPat <
(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;
+def : GCNPat <
+ (VGPRImm<(i64 imm)>:$imm),
+ (V_MOV_B64_PSEUDO imm:$imm)
+>;
+
+def : GCNPat <
+ (VGPRImm<(f64 fpimm)>:$imm),
+ (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
+>;
+
+def : GCNPat <
+ (i64 imm:$imm),
+ (S_MOV_B64_IMM_PSEUDO imm:$imm)
+>;
+
+def : GCNPat <
+ (f64 fpimm:$imm),
+ (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm)))
+>;
+
def : GCNPat <
(f32 fpimm:$imm),
(S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
index ed525fb83c6de82..621394fd290b0c3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
@@ -41,11 +41,12 @@ entry:
}
; GCN-LABEL: {{^}}v_clamp_i64_i16_invalid_lower
+; GFX6789: v_mov_b32_e32 v{{[0-9]+}}, 0x8001
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
-; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
+; GFX10: v_{{(dual_)?}}cndmask_b32{{(_e32)?}} [[A:v[0-9]+]], 0x8001, [[A]]
; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
entry:
@@ -56,6 +57,7 @@ entry:
}
; GCN-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher
+; GFX6789: v_mov_b32_e32 v{{[0-9]+}}, 0x8000
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 701a733d9e8e957..8bf34caea40513d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2090,69 +2090,69 @@ define amdgpu_ps double @dyn_extract_v16f64_s_s(i32 inreg %sel) {
; GCN-LABEL: dyn_extract_v16f64_s_s:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_mov_b32 s66, 0
+; GCN-NEXT: s_mov_b32 s64, 0
+; GCN-NEXT: s_mov_b32 s62, 0
+; GCN-NEXT: s_mov_b32 s60, 0
+; GCN-NEXT: s_mov_b32 s58, 0
+; GCN-NEXT: s_mov_b32 s56, 0
+; GCN-NEXT: s_mov_b32 s54, 0
+; GCN-NEXT: s_mov_b32 s52, 0
+; GCN-NEXT: s_mov_b32 s50, 0
+; GCN-NEXT: s_mov_b32 s48, 0
+; GCN-NEXT: s_mov_b32 s46, 0
+; GCN-NEXT: s_mov_b32 s44, 0
+; GCN-NEXT: s_mov_b32 s40, 0
; GCN-NEXT: s_mov_b64 s[36:37], 1.0
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: s_mov_b32 s67, 0x40300000
; GCN-NEXT: s_mov_b32 s65, 0x402e0000
-; GCN-NEXT: s_mov_b32 s64, s66
; GCN-NEXT: s_mov_b32 s63, 0x402c0000
-; GCN-NEXT: s_mov_b32 s62, s66
; GCN-NEXT: s_mov_b32 s61, 0x402a0000
-; GCN-NEXT: s_mov_b32 s60, s66
; GCN-NEXT: s_mov_b32 s59, 0x40280000
-; GCN-NEXT: s_mov_b32 s58, s66
; GCN-NEXT: s_mov_b32 s57, 0x40260000
-; GCN-NEXT: s_mov_b32 s56, s66
; GCN-NEXT: s_mov_b32 s55, 0x40240000
-; GCN-NEXT: s_mov_b32 s54, s66
; GCN-NEXT: s_mov_b32 s53, 0x40220000
-; GCN-NEXT: s_mov_b32 s52, s66
; GCN-NEXT: s_mov_b32 s51, 0x40200000
-; GCN-NEXT: s_mov_b32 s50, s66
; GCN-NEXT: s_mov_b32 s49, 0x401c0000
-; GCN-NEXT: s_mov_b32 s48, s66
; GCN-NEXT: s_mov_b32 s47, 0x40180000
-; GCN-NEXT: s_mov_b32 s46, s66
; GCN-NEXT: s_mov_b32 s45, 0x40140000
-; GCN-NEXT: s_mov_b32 s44, s66
; GCN-NEXT: s_mov_b64 s[42:43], 4.0
; GCN-NEXT: s_mov_b32 s41, 0x40080000
-; GCN-NEXT: s_mov_b32 s40, s66
; GCN-NEXT: s_mov_b64 s[38:39], 2.0
; GCN-NEXT: s_movrels_b64 s[0:1], s[36:37]
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: dyn_extract_v16f64_s_s:
; GFX10PLUS: ; %bb.0: ; %entry
-; GFX10PLUS-NEXT: s_mov_b32 s66, 0
; GFX10PLUS-NEXT: s_mov_b64 s[36:37], 1.0
; GFX10PLUS-NEXT: s_mov_b32 m0, s2
+; GFX10PLUS-NEXT: s_mov_b32 s66, 0
+; GFX10PLUS-NEXT: s_mov_b32 s64, 0
+; GFX10PLUS-NEXT: s_mov_b32 s62, 0
+; GFX10PLUS-NEXT: s_mov_b32 s60, 0
+; GFX10PLUS-NEXT: s_mov_b32 s58, 0
+; GFX10PLUS-NEXT: s_mov_b32 s56, 0
+; GFX10PLUS-NEXT: s_mov_b32 s54, 0
+; GFX10PLUS-NEXT: s_mov_b32 s52, 0
+; GFX10PLUS-NEXT: s_mov_b32 s50, 0
+; GFX10PLUS-NEXT: s_mov_b32 s48, 0
+; GFX10PLUS-NEXT: s_mov_b32 s46, 0
+; GFX10PLUS-NEXT: s_mov_b32 s44, 0
+; GFX10PLUS-NEXT: s_mov_b32 s40, 0
; GFX10PLUS-NEXT: s_mov_b32 s67, 0x40300000
; GFX10PLUS-NEXT: s_mov_b32 s65, 0x402e0000
-; GFX10PLUS-NEXT: s_mov_b32 s64, s66
; GFX10PLUS-NEXT: s_mov_b32 s63, 0x402c0000
-; GFX10PLUS-NEXT: s_mov_b32 s62, s66
; GFX10PLUS-NEXT: s_mov_b32 s61, 0x402a0000
-; GFX10PLUS-NEXT: s_mov_b32 s60, s66
; GFX10PLUS-NEXT: s_mov_b32 s59, 0x40280000
-; GFX10PLUS-NEXT: s_mov_b32 s58, s66
; GFX10PLUS-NEXT: s_mov_b32 s57, 0x40260000
-; GFX10PLUS-NEXT: s_mov_b32 s56, s66
; GFX10PLUS-NEXT: s_mov_b32 s55, 0x40240000
-; GFX10PLUS-NEXT: s_mov_b32 s54, s66
; GFX10PLUS-NEXT: s_mov_b32 s53, 0x40220000
-; GFX10PLUS-NEXT: s_mov_b32 s52, s66
; GFX10PLUS-NEXT: s_mov_b32 s51, 0x40200000
-; GFX10PLUS-NEXT: s_mov_b32 s50, s66
; GFX10PLUS-NEXT: s_mov_b32 s49, 0x401c0000
-; GFX10PLUS-NEXT: s_mov_b32 s48, s66
; GFX10PLUS-NEXT: s_mov_b32 s47, 0x40180000
-; GFX10PLUS-NEXT: s_mov_b32 s46, s66
; GFX10PLUS-NEXT: s_mov_b32 s45, 0x40140000
-; GFX10PLUS-NEXT: s_mov_b32 s44, s66
; GFX10PLUS-NEXT: s_mov_b64 s[42:43], 4.0
; GFX10PLUS-NEXT: s_mov_b32 s41, 0x40080000
-; GFX10PLUS-NEXT: s_mov_b32 s40, s66
; GFX10PLUS-NEXT: s_mov_b64 s[38:39], 2.0
; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[36:37]
; GFX10PLUS-NEXT: ; return to shader part epilog
@@ -3085,10 +3085,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8
+; GPRIDX-NEXT: s_mov_b32 s4, 0
+; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000
; GPRIDX-NEXT: s_mov_b32 s2, 0
; GPRIDX-NEXT: s_mov_b32 s3, 0x40140000
-; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000
-; GPRIDX-NEXT: s_mov_b32 s4, s2
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1
; GPRIDX-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
@@ -3176,10 +3176,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8
+; MOVREL-NEXT: s_mov_b32 s4, 0
+; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
; MOVREL-NEXT: s_mov_b32 s2, 0
; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
-; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
-; MOVREL-NEXT: s_mov_b32 s4, s2
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
@@ -3207,7 +3207,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -3250,7 +3250,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 12
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 9
+; GFX10-NEXT: wavefront_sgpr_count = 7
; GFX10-NEXT: workitem_vgpr_count = 3
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
@@ -3267,22 +3267,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: .end_amd_kernel_code_t
; GFX10-NEXT: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: s_mov_b32 s3, 0x40140000
-; GFX10-NEXT: s_mov_b32 s5, 0x40080000
-; GFX10-NEXT: s_mov_b32 s4, s2
+; GFX10-NEXT: s_mov_b32 s3, 0x40080000
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_eq_u32 s8, 1
-; GFX10-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
-; GFX10-NEXT: s_cmp_eq_u32 s8, 2
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX10-NEXT: s_cmp_eq_u32 s8, 3
-; GFX10-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
-; GFX10-NEXT: s_cmp_eq_u32 s8, 4
+; GFX10-NEXT: s_cmp_eq_u32 s6, 1
+; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
+; GFX10-NEXT: s_cmp_eq_u32 s6, 2
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s5, 0x40140000
+; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
+; GFX10-NEXT: s_cmp_eq_u32 s6, 4
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3299,7 +3299,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -3342,7 +3342,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 12
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 9
+; GFX11-NEXT: wavefront_sgpr_count = 7
; GFX11-NEXT: workitem_vgpr_count = 3
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
@@ -3359,22 +3359,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: .end_amd_kernel_code_t
; GFX11-NEXT: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s8, s[0:1], 0x8
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_mov_b32 s3, 0x40140000
-; GFX11-NEXT: s_mov_b32 s5, 0x40080000
-; GFX11-NEXT: s_mov_b32 s4, s2
+; GFX11-NEXT: s_mov_b32 s3, 0x40080000
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s8, 1
-; GFX11-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
-; GFX11-NEXT: s_cmp_eq_u32 s8, 2
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX11-NEXT: s_cmp_eq_u32 s8, 3
-; GFX11-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
-; GFX11-NEXT: s_cmp_eq_u32 s8, 4
+; GFX11-NEXT: s_cmp_eq_u32 s6, 1
+; GFX11-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
+; GFX11-NEXT: s_cmp_eq_u32 s6, 2
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s5, 0x40140000
+; GFX11-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
+; GFX11-NEXT: s_cmp_eq_u32 s6, 4
+; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
@@ -4784,11 +4784,8 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
; MOVREL-LABEL: v_extract_v64i32_32:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
-; MOVREL-NEXT: v_mov_b32_e32 v2, s4
-; MOVREL-NEXT: v_mov_b32_e32 v3, s5
-; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
+; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
; MOVREL-NEXT: s_setpc_b64 s[30:31]
@@ -4823,11 +4820,8 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) {
; MOVREL-LABEL: v_extract_v64i32_33:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
-; MOVREL-NEXT: v_mov_b32_e32 v2, s4
-; MOVREL-NEXT: v_mov_b32_e32 v3, s5
-; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
+; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
; MOVREL-NEXT: v_mov_b32_e32 v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 66bff4a14cac84b..c6ea046f95a9199 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1473,12 +1473,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
@@ -1504,12 +1504,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX940-...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/70395
More information about the llvm-commits
mailing list