[llvm] [SelectionDAG] Support integer types with multiple registers in ComputePHILiveOutRegInfo. (PR #172081)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 12:17:24 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-nvptx
Author: Craig Topper (topperc)
PHIs that are larger than a legal integer type are split into multiple virtual registers that are numbered sequentially. We can propagate the known bits for each of these registers individually.
Big-endian targets are not supported yet because the register order would need to be reversed.
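To make the scheme concrete, here is a minimal standalone sketch, not part of the patch, assuming a hypothetical 32-bit target where an i128 PHI is split into four sequentially numbered i32 registers; the register width and the constant incoming values below are illustrative only:

```cpp
// Sketch of the per-register known-bits computation: each split register
// covers bits [BitWidth*RegIdx, BitWidth*(RegIdx+1)) of the wide value,
// and known bits from the PHI's incoming values are merged per register.
// Compiles against LLVMSupport only.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  const unsigned BitWidth = 32;    // width of one legal register (assumed)
  const unsigned NumRegisters = 4; // i128 split into four i32 registers

  // Two made-up constant incoming values, already widened to the full
  // 128 bits the way the patch widens constants with sext/zext.
  APInt In0(BitWidth * NumRegisters, 0x1ULL);         // 1
  APInt In1(BitWidth * NumRegisters, 0x100000000ULL); // 1 << 32

  for (unsigned RegIdx = 0; RegIdx < NumRegisters; ++RegIdx) {
    // Mirrors Val.extractBits(BitWidth, BitWidth * RegIdx) in the patch.
    KnownBits Known =
        KnownBits::makeConstant(In0.extractBits(BitWidth, BitWidth * RegIdx));
    // Later incoming values are merged with intersectWith, keeping only
    // the bits known identically in all of them.
    Known = Known.intersectWith(
        KnownBits::makeConstant(In1.extractBits(BitWidth, BitWidth * RegIdx)));
    outs() << "reg base+" << RegIdx << ": Zero=" << Known.Zero
           << " One=" << Known.One << "\n";
  }
  return 0;
}
```

For these inputs the two high registers come out fully known-zero and only bit 0 of each low register is unknown, which is the kind of per-register fact SelectionDAG can now carry across blocks instead of giving up on multi-register PHIs entirely.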
Fixes #171671
---
Patch is 357.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/172081.diff
20 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp (+77-61)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+96-102)
- (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+936-968)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+64-70)
- (modified) llvm/test/CodeGen/ARM/cttz.ll (+51-49)
- (modified) llvm/test/CodeGen/NVPTX/i128.ll (+208-212)
- (modified) llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll (+6-7)
- (modified) llvm/test/CodeGen/RISCV/idiv_large.ll (+1205-1252)
- (modified) llvm/test/CodeGen/RISCV/rv32xtheadbb.ll (+4-5)
- (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+3-4)
- (modified) llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll (+3-5)
- (modified) llvm/test/CodeGen/X86/bsf.ll (+47-55)
- (modified) llvm/test/CodeGen/X86/bsr.ll (+32-33)
- (modified) llvm/test/CodeGen/X86/ctlo.ll (+5-7)
- (modified) llvm/test/CodeGen/X86/ctlz.ll (+5-7)
- (modified) llvm/test/CodeGen/X86/cttz.ll (+5-13)
- (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll (+204-219)
- (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll (+200-195)
- (modified) llvm/test/CodeGen/X86/scheduler-backtracking.ll (+70-90)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll (+1-5)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index f4edbc74a3cc4..e73743ecbc9fa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -451,7 +451,9 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
"PHIs with non-vector integer types should have a single VT.");
EVT IntVT = ValueVTs[0];
- if (TLI->getNumRegisters(PN->getContext(), IntVT) != 1)
+ unsigned NumRegisters = TLI->getNumRegisters(PN->getContext(), IntVT);
+ // FIXME: Support multiple registers for big endian targets.
+ if (NumRegisters != 1 && MF->getDataLayout().isBigEndian())
return;
IntVT = TLI->getRegisterType(PN->getContext(), IntVT);
unsigned BitWidth = IntVT.getSizeInBits();
@@ -460,81 +462,95 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
if (It == ValueMap.end())
return;
- Register DestReg = It->second;
- if (DestReg == 0)
+ Register BaseReg = It->second;
+ if (!BaseReg)
return;
- assert(DestReg.isVirtual() && "Expected a virtual reg");
- LiveOutRegInfo.grow(DestReg);
- LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg];
-
- Value *V = PN->getIncomingValue(0);
- if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
- DestLOI.NumSignBits = 1;
- DestLOI.Known = KnownBits(BitWidth);
- return;
- }
-
- if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
- APInt Val;
- if (TLI->signExtendConstant(CI))
- Val = CI->getValue().sext(BitWidth);
- else
- Val = CI->getValue().zext(BitWidth);
- DestLOI.NumSignBits = Val.getNumSignBits();
- DestLOI.Known = KnownBits::makeConstant(Val);
- } else {
- assert(ValueMap.count(V) && "V should have been placed in ValueMap when its"
- "CopyToReg node was created.");
- Register SrcReg = ValueMap[V];
- if (!SrcReg.isVirtual()) {
- DestLOI.IsValid = false;
- return;
- }
- const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
- if (!SrcLOI) {
- DestLOI.IsValid = false;
- return;
- }
- DestLOI = *SrcLOI;
- }
+ assert(BaseReg.isVirtual() && "Expected a virtual reg");
- assert(DestLOI.Known.Zero.getBitWidth() == BitWidth &&
- DestLOI.Known.One.getBitWidth() == BitWidth &&
- "Masks should have the same bit width as the type.");
+ for (unsigned RegIdx = 0; RegIdx < NumRegisters; ++RegIdx) {
+ // Split registers are assigned sequentially.
+ Register DestReg = BaseReg.id() + RegIdx;
+ LiveOutRegInfo.grow(DestReg);
+ LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg];
- for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *V = PN->getIncomingValue(i);
+ Value *V = PN->getIncomingValue(0);
if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
DestLOI.NumSignBits = 1;
DestLOI.Known = KnownBits(BitWidth);
- return;
+ continue;
}
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
APInt Val;
if (TLI->signExtendConstant(CI))
- Val = CI->getValue().sext(BitWidth);
+ Val = CI->getValue().sext(BitWidth * NumRegisters);
else
- Val = CI->getValue().zext(BitWidth);
- DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, Val.getNumSignBits());
- DestLOI.Known = DestLOI.Known.intersectWith(KnownBits::makeConstant(Val));
- continue;
+ Val = CI->getValue().zext(BitWidth * NumRegisters);
+ APInt Extracted = Val.extractBits(BitWidth, BitWidth * RegIdx);
+ DestLOI.NumSignBits = Extracted.getNumSignBits();
+ DestLOI.Known = KnownBits::makeConstant(Extracted);
+ } else {
+ assert(ValueMap.count(V) &&
+ "V should have been placed in ValueMap when its"
+ "CopyToReg node was created.");
+ Register SrcReg = ValueMap[V];
+ if (!SrcReg.isVirtual()) {
+ DestLOI.IsValid = false;
+ continue;
+ }
+ // Split registers are assigned sequentially.
+ SrcReg = SrcReg.id() + RegIdx;
+ const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+ if (!SrcLOI) {
+ DestLOI.IsValid = false;
+ continue;
+ }
+ DestLOI = *SrcLOI;
}
- assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
- "its CopyToReg node was created.");
- Register SrcReg = ValueMap[V];
- if (!SrcReg.isVirtual()) {
- DestLOI.IsValid = false;
- return;
- }
- const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
- if (!SrcLOI) {
- DestLOI.IsValid = false;
- return;
+ assert(DestLOI.Known.Zero.getBitWidth() == BitWidth &&
+ DestLOI.Known.One.getBitWidth() == BitWidth &&
+ "Masks should have the same bit width as the type.");
+
+ for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = PN->getIncomingValue(i);
+ if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
+ DestLOI.NumSignBits = 1;
+ DestLOI.Known = KnownBits(BitWidth);
+ break;
+ }
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ APInt Val;
+ if (TLI->signExtendConstant(CI))
+ Val = CI->getValue().sext(BitWidth * NumRegisters);
+ else
+ Val = CI->getValue().zext(BitWidth * NumRegisters);
+ APInt Extracted = Val.extractBits(BitWidth, BitWidth * RegIdx);
+ DestLOI.NumSignBits =
+ std::min(DestLOI.NumSignBits, Extracted.getNumSignBits());
+ DestLOI.Known =
+ DestLOI.Known.intersectWith(KnownBits::makeConstant(Extracted));
+ continue;
+ }
+
+ assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
+ "its CopyToReg node was created.");
+ Register SrcReg = ValueMap[V];
+ if (!SrcReg.isVirtual()) {
+ DestLOI.IsValid = false;
+ break;
+ }
+ // Split registers are assigned sequentially.
+ SrcReg = SrcReg.id() + RegIdx;
+ const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+ if (!SrcLOI) {
+ DestLOI.IsValid = false;
+ break;
+ }
+ DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits);
+ DestLOI.Known = DestLOI.Known.intersectWith(SrcLOI->Known);
}
- DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits);
- DestLOI.Known = DestLOI.Known.intersectWith(SrcLOI->Known);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index dd9a013d37203..d5b5ab6e457f9 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -14,7 +14,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT: v_ashrrev_i32_e32 v16, 31, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v3
; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc
@@ -24,13 +24,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, 0, v6, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v7, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v4, v0, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v18, 31, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v5, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: v_or_b32_e32 v3, v20, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v21, v0
+; GFX9-NEXT: v_or_b32_e32 v3, v21, v1
+; GFX9-NEXT: v_or_b32_e32 v2, v22, v0
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v3, v9, v11
; GFX9-NEXT: v_or_b32_e32 v2, v8, v10
@@ -39,9 +39,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_add_u32_e32 v2, 32, v2
; GFX9-NEXT: v_ffbh_u32_e32 v3, v1
; GFX9-NEXT: v_min_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_ffbh_u32_e32 v3, v21
+; GFX9-NEXT: v_ffbh_u32_e32 v3, v22
; GFX9-NEXT: v_add_u32_e32 v3, 32, v3
-; GFX9-NEXT: v_ffbh_u32_e32 v4, v20
+; GFX9-NEXT: v_ffbh_u32_e32 v4, v21
; GFX9-NEXT: v_min_u32_e32 v3, v3, v4
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 64, v3
@@ -69,18 +69,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v18, v16
+; GFX9-NEXT: v_mov_b32_e32 v19, v17
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v19, v17
+; GFX9-NEXT: v_mov_b32_e32 v20, v18
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX9-NEXT: v_xor_b32_e32 v6, 0x7f, v2
-; GFX9-NEXT: v_or_b32_e32 v7, v3, v5
; GFX9-NEXT: v_or_b32_e32 v6, v6, v4
+; GFX9-NEXT: v_or_b32_e32 v7, v3, v5
; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
@@ -92,14 +92,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_6
; GFX9-NEXT: ; %bb.1: ; %udiv-bb1
-; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v4, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v23, vcc, 1, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v4, vcc
; GFX9-NEXT: v_sub_u32_e32 v7, 0x7f, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v5, vcc
; GFX9-NEXT: v_sub_u32_e32 v12, 64, v7
-; GFX9-NEXT: v_or_b32_e32 v4, v23, v25
-; GFX9-NEXT: v_or_b32_e32 v3, v22, v24
+; GFX9-NEXT: v_or_b32_e32 v4, v24, v26
+; GFX9-NEXT: v_or_b32_e32 v3, v23, v25
; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, v[10:11]
; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[8:9]
; GFX9-NEXT: v_sub_u32_e32 v2, 63, v2
@@ -113,99 +113,96 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, v[8:9]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v12, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_5
; GFX9-NEXT: ; %bb.2: ; %udiv-preheader
-; GFX9-NEXT: v_sub_u32_e32 v12, 64, v22
-; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[8:9]
+; GFX9-NEXT: v_sub_u32_e32 v12, 64, v23
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v23, v[8:9]
; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[10:11]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
-; GFX9-NEXT: v_or_b32_e32 v12, v6, v12
-; GFX9-NEXT: v_subrev_u32_e32 v6, 64, v22
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX9-NEXT: v_or_b32_e32 v14, v6, v12
+; GFX9-NEXT: v_subrev_u32_e32 v6, 64, v23
; GFX9-NEXT: v_or_b32_e32 v13, v7, v13
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v6, v[10:11]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v12, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v21
-; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v20, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v7, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v6, v14, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v23, v[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v9, v8, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v14, 0, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v6, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, -1, v22
+; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v21, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, -1, v0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v12, 0
-; GFX9-NEXT: v_mov_b32_e32 v14, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NEXT: v_mov_b32_e32 v15, 0
; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: v_mov_b32_e32 v15, 0
+; GFX9-NEXT: v_mov_b32_e32 v16, 0
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: .LBB0_3: ; %udiv-do-while
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GFX9-NEXT: v_or_b32_e32 v4, v14, v30
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v9
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
-; GFX9-NEXT: v_or_b32_e32 v5, v15, v31
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11]
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 31, v3
-; GFX9-NEXT: v_or_b32_e32 v8, v8, v15
-; GFX9-NEXT: v_or_b32_e32 v10, v10, v14
-; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v9, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14
-; GFX9-NEXT: v_and_b32_e32 v14, v30, v21
+; GFX9-NEXT: v_or_b32_e32 v4, v15, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 31, v12
+; GFX9-NEXT: v_lshlrev_b64 v[11:12], 1, v[11:12]
+; GFX9-NEXT: v_or_b32_e32 v5, v16, v31
+; GFX9-NEXT: v_lshlrev_b64 v[13:14], 1, v[13:14]
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v3
+; GFX9-NEXT: v_or_b32_e32 v11, v11, v16
+; GFX9-NEXT: v_or_b32_e32 v13, v13, v15
+; GFX9-NEXT: v_sub_co_u32_e32 v15, vcc, v10, v11
+; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v27, v12, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v28, v13, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v29, v14, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v15
+; GFX9-NEXT: v_and_b32_e32 v15, v30, v22
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14
-; GFX9-NEXT: v_and_b32_e32 v14, v30, v20
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc
-; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12
+; GFX9-NEXT: v_sub_co_u32_e32 v11, vcc, v11, v15
+; GFX9-NEXT: v_and_b32_e32 v15, v30, v21
+; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX9-NEXT: v_or3_b32 v2, v2, v6, v8
; GFX9-NEXT: v_and_b32_e32 v6, v30, v0
-; GFX9-NEXT: v_and_b32_e32 v14, v30, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v6, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v14, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22
-; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc
+; GFX9-NEXT: v_and_b32_e32 v15, v30, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v13, v6, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v15, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v23, vcc, -1, v23
; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc
-; GFX9-NEXT: v_or_b32_e32 v14, v22, v24
+; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v26, vcc
; GFX9-NEXT: v_or_b32_e32 v15, v23, v25
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GFX9-NEXT: v_or_b32_e32 v16, v24, v26
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[15:16]
; GFX9-NEXT: v_and_b32_e32 v6, 1, v30
-; GFX9-NEXT: v_mov_b32_e32 v15, v7
-; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13
+; GFX9-NEXT: v_mov_b32_e32 v16, v7
+; GFX9-NEXT: v_or3_b32 v3, v3, 0, v9
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v15, v6
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB0_3
; GFX9-NEXT: ; %bb.4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB0_5: ; %Flow2
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_lshlrev_b64 v[12:13], 1, v[2:3]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX9-NEXT: v_or3_b32 v13, v3, 0, v13
-; GFX9-NEXT: v_or3_b32 v12, v2, v4, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GFX9-NEXT: v_or_b32_e32 v12, v12, v2
; GFX9-NEXT: v_or_b32_e32 v7, v7, v1
; GFX9-NEXT: v_or_b32_e32 v6, v6, v0
; GFX9-NEXT: .LBB0_6: ; %Flow3
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX9-NEXT: v_xor_b32_e32 v2, v17, v16
-; GFX9-NEXT: v_xor_b32_e32 v3, v19, v18
+; GFX9-NEXT: v_xor_b32_e32 v2, v18, v17
+; GFX9-NEXT: v_xor_b32_e32 v3, v20, v19
; GFX9-NEXT: v_xor_b32_e32 v0, v6, v2
; GFX9-NEXT: v_xor_b32_e32 v1, v7, v3
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
@@ -2268,21 +2265,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v8, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, 0, v8, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13]
-; GFX9-NEXT: v_or_b32_e32 v10, v13, v15
+; GFX9-NEXT: v_or_b32_e32 v11, v13, v15
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_xor_b32_e32 v9, 0x7f, v12
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX9-NEXT: v_or_b32_e32 v9, v9, v14
+; GFX9-NEXT: v_xor_b32_e32 v8, 0x7f, v12
+; GFX9-NEXT: v_or_b32_e32 v10, v8, v14
; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, 0, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, 0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, 0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v0, 0, s[4:5]
; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc
@@ -2310,12 +2307,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15
; GFX9-NEXT: v_mov_b32_e32 v12, 0
-; GFX9-NEXT: v_mov_b32_e32 v14, 0
; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v13, 0
-; GFX9-NEXT: v_mov_b32_e32 v15, 0
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -2329,21 +2324,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v18
; GFX9-NEXT: v_or_b32_e32 v15, v13, v15
; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], v18, v[2:3]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v15, v13, v1,...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/172081