[llvm] [AMDGPU][GlobalISel] Expand SGPR S1 exts into G_SELECT (PR #68858)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 12 00:01:21 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
Changes
This patch prevents most SGPR S1 extends (G_SEXT/G_ZEXT/G_ANYEXT) from reaching ISel by lowering them to a G_SELECT in RegBankSelect.
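For illustration, a minimal MIR sketch of the s32 zext case of that lowering (the value names and register banks are mine, not taken from the patch's tests):

```
; Before: an SGPR-bank s1 extend that previously survived to ISel.
%ext:sgpr(s32) = G_ZEXT %cond:sgpr(s1)

; After RegBankSelect lowering: the condition is any-extended to s32 and the
; extend becomes a select of constants (1/0 for zext, -1/0 for sext; the s16
; case adds a G_TRUNC of the result, the s64 case a merge for the high half).
%false:sgpr(s32) = G_CONSTANT i32 0
%true:sgpr(s32) = G_CONSTANT i32 1
%c:sgpr(s32) = G_ANYEXT %cond:sgpr(s1)
%ext:sgpr(s32) = G_SELECT %c, %true, %false
```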
It also adds new ISel logic to lower `G_SELECT %cond, 1, 0` to a simple SCC copy, because a copy out of SCC is already materialized as an `s_cselect x, 1, 0`. Without it we get some regressions.
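A rough sketch of that special case (the surrounding MIR is assumed for illustration):

```
; An SGPR select of the constants 1 and 0 ...
;   %dst:sgpr(s32) = G_SELECT %cond:sgpr(s32), %one, %zero
; ... now selects to a pair of SCC copies instead of an S_CSELECT_B32,
; since lowering a copy out of $scc already emits s_cselect_b32 %dst, 1, 0:
$scc = COPY %cond
%dst:sreg_32 = COPY $scc
```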
---
Patch is 80.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68858.diff
22 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+11-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+32-16)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+48-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll (-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll (-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+106-106)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir (+28-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-freeze.mir (+4-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir (+28-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir (+30-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll (-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll (+32-48)
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll (+56-84)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 8d4cad4c07bc74c..16f82b50f6dec01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -133,6 +133,15 @@ def expand_promoted_fmed3 : GICombineRule<
} // End Predicates = [NotHasMed3_16]
+// Form G_AMDGPU_SGPR_S1_TO_VCC_COPY from (copy (trunc)) with the right regbanks.
+def fold_sgpr_s1_to_vcc_copy : GICombineRule<
+ (defs root:$dst),
+ (match (COPY $dst, $tmp):$copy,
+ (G_TRUNC $tmp, $src),
+ [{ return Helper.matchFoldSGPRS1ToVCCCopy(*${copy}, ${src}.getReg()); }]),
+ (apply [{ Helper.applyFoldSGPRS1ToVCCCopy(*${copy}, ${src}.getReg()); }])
+>;
+
// Combines which should only apply on SI/CI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -155,7 +164,8 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
- [unmerge_merge, unmerge_cst, unmerge_undef,
+ [unmerge_merge, unmerge_cst, unmerge_undef, trunc_ext_fold,
+ anyext_trunc_fold, select_constant_cmp,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..85b5b5988619905 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2153,24 +2153,40 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
assert(Size <= 32 || Size == 64);
const MachineOperand &CCOp = I.getOperand(1);
Register CCReg = CCOp.getReg();
+
+ Register TrueVal = I.getOperand(2).getReg();
+ Register FalseVal = I.getOperand(3).getReg();
if (!isVCC(CCReg, *MRI)) {
unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
AMDGPU::S_CSELECT_B32;
MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
.addReg(CCReg);
- // The generic constrainSelectedInstRegOperands doesn't work for the scc register
- // bank, because it does not cover the register class that we used to represent
- // for it. So we need to manually set the register class here.
- if (!MRI->getRegClassOrNull(CCReg))
- MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
- MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
- .add(I.getOperand(2))
- .add(I.getOperand(3));
+ bool Ret = constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
+
+ // select 1, 0 is just a copy of SCC.
+ if (getIConstantVRegVal(TrueVal, *MRI) == 1 &&
+ getIConstantVRegVal(FalseVal, *MRI) == 0) {
+ // FIXME: Do we need to have two copies or could we get away with just
+ // returning CCReg?
+ MachineInstr *RetCopy =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(AMDGPU::SCC);
+ Ret |= constrainSelectedInstRegOperands(*RetCopy, TII, TRI, RBI);
+ } else {
+ // The generic constrainSelectedInstRegOperands doesn't work for the scc
+ // register bank, because it does not cover the register class that we
+ // used to represent for it. So we need to manually set the register
+ // class here.
+ if (!MRI->getRegClassOrNull(CCReg))
+ MRI->setRegClass(CCReg,
+ TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
+ MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
+ .addReg(TrueVal)
+ .addReg(FalseVal);
+ Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ }
- bool Ret = false;
- Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
- Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
I.eraseFromParent();
return Ret;
}
@@ -2181,11 +2197,11 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
MachineInstr *Select =
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
- .addImm(0)
- .add(I.getOperand(3))
- .addImm(0)
- .add(I.getOperand(2))
- .add(I.getOperand(1));
+ .addImm(0)
+ .addReg(FalseVal)
+ .addImm(0)
+ .addReg(TrueVal)
+ .add(I.getOperand(1));
bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
I.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5b056bd9e5dba2c..402ee5cfbb5a053 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2556,16 +2556,62 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_SEXT:
case AMDGPU::G_ZEXT:
case AMDGPU::G_ANYEXT: {
+ Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(SrcReg);
const bool Signed = Opc == AMDGPU::G_SEXT;
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+
assert(OpdMapper.getVRegs(1).empty());
const RegisterBank *SrcBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
- Register DstReg = MI.getOperand(0).getReg();
+ LLT SelType = MRI.getType(MI.getOperand(0).getReg());
+
+ // Extending SGPR S1 to S16/32/64.
+ if (SrcBank == &AMDGPU::SGPRRegBank &&
+ MRI.getType(SrcReg) == LLT::scalar(1) &&
+ (SelType == S32 || SelType == S16 || SelType == S64)) {
+
+ Register False = B.buildConstant(S32, 0).getReg(0);
+ MRI.setRegBank(False, AMDGPU::SGPRRegBank);
+
+ Register True = Signed ? B.buildConstant(S32, -1).getReg(0)
+ : B.buildConstant(S32, 1).getReg(0);
+ MRI.setRegBank(True, AMDGPU::SGPRRegBank);
+
+ B.setInstrAndDebugLoc(MI);
+ Register NewReg = MRI.createGenericVirtualRegister(S32);
+ B.buildInstr(AMDGPU::G_ANYEXT, {NewReg}, {MI.getOperand(1).getReg()});
+ MRI.setRegBank(NewReg, AMDGPU::SGPRRegBank);
+
+ if (SelType == S32) {
+ B.buildSelect(DstReg, NewReg, True, False);
+ } else if (SelType == S16) {
+ Register TmpReg = B.buildSelect(S32, NewReg, True, False).getReg(0);
+ B.buildTrunc(DstReg, TmpReg);
+ MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
+
+ } else if (SelType == S64) {
+ Register TmpReg = B.buildSelect(S32, NewReg, True, False).getReg(0);
+ MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
+
+ Register HiPart = Signed ? TmpReg : B.buildConstant(S32, 0).getReg(0);
+ MRI.setRegBank(HiPart, AMDGPU::SGPRRegBank);
+
+ B.buildMergeLikeInstr(DstReg, {TmpReg, HiPart});
+ } else
+ llvm_unreachable("bad type");
+
+ MRI.setRegBank(DstReg, *SrcBank); // FIXME: Correct?
+ MI.eraseFromParent();
+ return;
+ }
+
LLT DstTy = MRI.getType(DstReg);
if (DstTy.isScalar() &&
SrcBank != &AMDGPU::SGPRRegBank &&
@@ -2609,7 +2655,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
SrcBank->getID() == AMDGPU::SGPRRegBankID;
// TODO: Should s16 select be legal?
- LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
+ LLT SelType = UseSel64 ? LLT::scalar(64) : S32;
auto True = B.buildConstant(SelType, Signed ? -1 : 1);
auto False = B.buildConstant(SelType, 0);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index a1013f3803e781a..cdb817a00911016 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -640,7 +640,6 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX7-NEXT: s_cmp_lt_i32 s1, 0
; GFX7-NEXT: s_cselect_b32 s1, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s1, s0
-; GFX7-NEXT: s_and_b32 s0, s0, 1
; GFX7-NEXT: s_add_i32 s0, s2, s0
; GFX7-NEXT: ; return to shader part epilog
;
@@ -652,7 +651,6 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX8-NEXT: s_cmp_lt_i32 s1, 0
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s1, s0
-; GFX8-NEXT: s_and_b32 s0, s0, 1
; GFX8-NEXT: s_add_i32 s0, s2, s0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -664,7 +662,6 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX9-NEXT: s_cmp_lt_i32 s1, 0
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s1, s0
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_add_i32 s0, s2, s0
; GFX9-NEXT: ; return to shader part epilog
%saddo = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@@ -749,8 +746,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX7-NEXT: s_cselect_b32 s3, 1, 0
; GFX7-NEXT: s_xor_b32 s0, s2, s0
; GFX7-NEXT: s_xor_b32 s1, s3, s1
-; GFX7-NEXT: s_and_b32 s0, s0, 1
-; GFX7-NEXT: s_and_b32 s1, s1, 1
; GFX7-NEXT: s_add_i32 s0, s4, s0
; GFX7-NEXT: s_add_i32 s1, s5, s1
; GFX7-NEXT: ; return to shader part epilog
@@ -769,8 +764,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
; GFX8-NEXT: s_xor_b32 s0, s2, s0
; GFX8-NEXT: s_xor_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, s1, 1
; GFX8-NEXT: s_add_i32 s0, s4, s0
; GFX8-NEXT: s_add_i32 s1, s5, s1
; GFX8-NEXT: ; return to shader part epilog
@@ -789,8 +782,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
; GFX9-NEXT: s_xor_b32 s0, s2, s0
; GFX9-NEXT: s_xor_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s0, s4, s0
; GFX9-NEXT: s_add_i32 s1, s5, s1
; GFX9-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 043e69abaeef2d3..38d49add27df132 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -66,7 +66,6 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: .LBB2_2: ; %Flow
; GCN-NEXT: s_xor_b32 s2, s4, 1
-; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
; GCN-NEXT: ; %bb.3: ; %.zero
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index da9601a8998c2ba..138eb26063f67c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -36,7 +36,6 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB0_2: ; %Flow
; GFX9-NEXT: s_xor_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc1 .LBB0_4
; GFX9-NEXT: ; %bb.3: ; %bb0
@@ -121,7 +120,6 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB1_2: ; %Flow
; GFX9-NEXT: s_xor_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc1 .LBB1_4
; GFX9-NEXT: ; %bb.3: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index eb3f74be71de017..cef9e86c49e513a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -860,7 +860,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: v_mov_b32_e32 v1, s12
; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1
; GFX7-NEXT: s_mul_i32 s18, s1, s8
-; GFX7-NEXT: s_cselect_b32 s25, 1, 0
+; GFX7-NEXT: s_cselect_b32 s26, 1, 0
; GFX7-NEXT: s_add_u32 s18, s18, s17
; GFX7-NEXT: s_addc_u32 s17, s23, s22
; GFX7-NEXT: v_mov_b32_e32 v4, s11
@@ -871,33 +871,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_mul_i32 s24, s1, s11
; GFX7-NEXT: v_readfirstlane_b32 s28, v3
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: v_readfirstlane_b32 s27, v5
+; GFX7-NEXT: v_readfirstlane_b32 s25, v5
; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9
-; GFX7-NEXT: s_cselect_b32 s26, 1, 0
+; GFX7-NEXT: s_cselect_b32 s27, 1, 0
; GFX7-NEXT: s_add_u32 s24, s24, s22
-; GFX7-NEXT: s_addc_u32 s23, s27, s23
+; GFX7-NEXT: s_addc_u32 s23, s25, s23
; GFX7-NEXT: v_readfirstlane_b32 s29, v5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8
-; GFX7-NEXT: s_mul_i32 s27, s2, s10
+; GFX7-NEXT: s_mul_i32 s25, s2, s10
; GFX7-NEXT: s_cselect_b32 s22, 1, 0
-; GFX7-NEXT: s_add_u32 s24, s27, s24
+; GFX7-NEXT: s_add_u32 s24, s25, s24
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10
-; GFX7-NEXT: s_addc_u32 s27, s28, s23
+; GFX7-NEXT: s_addc_u32 s25, s28, s23
; GFX7-NEXT: s_mul_i32 s28, s3, s9
; GFX7-NEXT: s_cselect_b32 s23, 1, 0
; GFX7-NEXT: s_add_u32 s28, s28, s24
; GFX7-NEXT: v_readfirstlane_b32 s30, v6
; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4
-; GFX7-NEXT: s_addc_u32 s27, s29, s27
+; GFX7-NEXT: s_addc_u32 s25, s29, s25
; GFX7-NEXT: s_mul_i32 s29, s4, s8
; GFX7-NEXT: s_cselect_b32 s24, 1, 0
; GFX7-NEXT: s_add_u32 s28, s29, s28
; GFX7-NEXT: v_readfirstlane_b32 s33, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9
-; GFX7-NEXT: s_addc_u32 s27, s30, s27
+; GFX7-NEXT: s_addc_u32 s29, s30, s25
; GFX7-NEXT: s_mul_i32 s30, s16, s11
-; GFX7-NEXT: s_cselect_b32 s29, 1, 0
+; GFX7-NEXT: s_cselect_b32 s25, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s31, v6
; GFX7-NEXT: s_add_u32 s19, s30, s19
; GFX7-NEXT: s_addc_u32 s28, s31, s28
@@ -919,84 +919,84 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_addc_u32 s28, s35, s28
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT: s_cselect_b32 s34, 1, 0
-; GFX7-NEXT: s_cmp_lg_u32 s26, 0
-; GFX7-NEXT: s_addc_u32 s19, s25, s19
+; GFX7-NEXT: s_cmp_lg_u32 s27, 0
+; GFX7-NEXT: s_addc_u32 s19, s26, s19
; GFX7-NEXT: v_mov_b32_e32 v2, s13
-; GFX7-NEXT: s_cselect_b32 s25, 1, 0
+; GFX7-NEXT: s_cselect_b32 s26, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2
; GFX7-NEXT: s_addc_u32 s20, s20, 0
-; GFX7-NEXT: v_readfirstlane_b32 s26, v0
+; GFX7-NEXT: v_readfirstlane_b32 s27, v0
; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1
-; GFX7-NEXT: s_cmp_lg_u32 s25, 0
+; GFX7-NEXT: s_cmp_lg_u32 s26, 0
; GFX7-NEXT: s_addc_u32 s20, s20, s28
-; GFX7-NEXT: s_mul_i32 s25, s16, s14
+; GFX7-NEXT: s_mul_i32 s26, s16, s14
; GFX7-NEXT: s_mul_i32 s28, s1, s13
; GFX7-NEXT: s_cselect_b32 s21, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11
; GFX7-NEXT: s_mul_i32 s28, s2, s12
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10
; GFX7-NEXT: s_mul_i32 s28, s3, s11
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9
; GFX7-NEXT: s_mul_i32 s28, s4, s10
-; GFX7-NEXT: s_add_u32 s25, s28, s25
+; GFX7-NEXT: s_add_u32 s26, s28, s26
; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8
; GFX7-NEXT: s_mul_i32 s28, s5, s9
-; GFX7-NEXT: s_add_u32 s25, s28, s25
+; GFX7-NEXT: s_add_u32 s26, s28, s26
; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2
; GFX7-NEXT: v_readfirstlane_b32 s36, v1
; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: s_mul_i32 s28, s6, s8
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
-; GFX7-NEXT: s_add_u32 s25, s28, s25
-; GFX7-NEXT: s_addc_u32 s26, s35, s26
+; GFX7-NEXT: s_add_u32 s26, s28, s26
+; GFX7-NEXT: s_addc_u32 s27, s35, s27
; GFX7-NEXT: s_mul_i32 s28, s16, s13
; GFX7-NEXT: v_readfirstlane_b32 s35, v2
-; GFX7-NEXT: s_add_u32 s27, s28, s27
+; GFX7-NEXT: s_add_u32 s28, s28, s29
; GFX7-NEXT: v_readfirstlane_b32 s37, v1
; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10
-; GFX7-NEXT: s_addc_u32 s25, s35, s25
+; GFX7-NEXT: s_addc_u32 s26, s35, s26
; GFX7-NEXT: s_mul_i32 s35, s1, s12
-; GFX7-NEXT: s_cselect_b32 s28, 1, 0
-; GFX7-NEXT: s_add_u32 s27, s35, s27
-; GFX7-NEXT: s_addc_u32 s25, s36, s25
+; GFX7-NEXT: s_cselect_b32 s29, 1, 0
+; GFX7-NEXT: s_add_u32 s28, s35, s28
+; GFX7-NEXT: s_addc_u32 s26, s36, s26
; GFX7-NEXT: s_mul_i32 s36, s2, s11
; GFX7-NEXT: s_cselect_b32 s35, 1, 0
-; GFX7-NEXT: s_add_u32 s27, s36, s27
+; GFX7-NEXT: s_add_u32 s28, s36, s28
; GFX7-NEXT: v_readfirstlane_b32 s38, v1
; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9
-; GFX7-NEXT: s_addc_u32 s25, s37, s25
+; GFX7-NEXT: s_addc_u32 s26, s37, s26
; GFX7-NEXT: s_mul_i32 s37, s3, s10
; GFX7-NEXT: s_cselect_b32 s36, 1, 0
-; GFX7-NEXT: s_add_u32 s27, s37, s27
+; GFX7-NEXT: s_add_u32 s28, s37, s28
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8
-; GFX7-NEXT: s_addc_u32 s25, s38, s25
+; GFX7-NEXT: s_addc_u32 s26, s38, s26
; GFX7-NEXT: s_mul_i32 s38, s4, s9
; GFX7-NEXT: s_cselect_b32 s37, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s39, v1
-; GFX7-NEXT: s_add_u32 s27, s38, s27
-; GFX7-NEXT: s_addc_u32 s25, s39, s25
+; GFX7-NEXT: s_add_u32 s28, s38, s28
+; GFX7-NEXT: s_addc_u32 s26, s39, s26
; GFX7-NEXT: s_mul_i32 s39, s5, s8
; GFX7-NEXT: s_cselect_b32 s38, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s40, v0
-; GFX7-NEXT: s_add_u32 s27, s39, s27
-; GFX7-NEXT: s_addc_u32 s25, s40, s25
+; GFX7-NEXT: s_add_u32 s28, s39, s28
+; GFX7-NEXT: s_addc_u32 s26, s40, s26
; GFX7-NEXT: s_cselect_b32 s39, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s31, 0
; GFX7-NEXT: s_addc_u32 s30, s30, 0
@@ -1005,18 +1005,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_cmp_lg_u32 s34, 0
; GFX7-NEXT: s_addc_u32 s30, s30, 0
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
-; GFX7-NEXT: s_addc_u32 s21, s30, s27
-; GFX7-NEXT: s_cselect_b32 s27, 1, 0
+; GFX7-NEXT: s_addc_u32 s21, s30, s28
+; GFX7-NEXT: s_cselect_b32 s28, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s23, 0
; GFX7-NEXT: s_addc_u32 s22, s22, 0
; GFX7-NEXT: s_cmp_lg_u32 s24, 0
; GFX7-NEXT: s_addc_u32 s22, s22, 0
-; GFX7-NEXT: s_cmp_lg_u32 s29, 0
+; GFX7-NEXT: s_cmp_lg_u32 s25, 0
; GFX7-NEXT: s_addc_u32 s22, s22, 0
-; GFX7-NEXT: s_cmp_lg_u32 s27, 0
-; GFX7-NEXT: s_addc_u32 s22, s22, s25
+; GFX7-NEXT: s_cmp_lg_u32 s28, 0
+; GFX7-NEXT: s_addc_u32 s22, s22, s26
; GFX7-NEXT: s_mul_i32 s16, s16, s15
-; GFX7-NEXT: s_addc_u32 s15, s26, s16
+; GFX7-NEXT: s_addc_u32 s15, s27, s16
; GFX7-NEXT: s_mul_i32 s1, s1, s14
; GFX7-NEXT: s_cmp_lg_u32 s39, 0
; GFX7-NEXT: s_addc_u32 s1, s15, s1
@@ -1033,7 +1033,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/68858
More information about the llvm-commits mailing list