[llvm] [ExpandLargeDivRem] Scalarize vector types. (PR #86959)

Bevin Hansson via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 28 07:47:46 PDT 2024


https://github.com/bevin-hansson created https://github.com/llvm/llvm-project/pull/86959

expand-large-divrem cannot handle vector types.
If overly large vector element types survive into
isel, they will likely be scalarized there, but since
isel cannot handle scalar integer types of that size,
it will assert.

Handle vector types in expand-large-divrem by
scalarizing them and then expanding the scalar type
operation. For large vectors, this results in a
*massive* code expansion, but it's better than
asserting.


From 55fa26d38f22c32c3017feec1a546fffa54f32ca Mon Sep 17 00:00:00 2001
From: Bevin Hansson <bevin.hansson at ericsson.com>
Date: Thu, 28 Mar 2024 13:42:53 +0100
Subject: [PATCH] [ExpandLargeDivRem] Scalarize vector types.

expand-large-divrem cannot handle vector types.
If overly large vector element types survive into
isel, they will likely be scalarized there, but since
isel cannot handle scalar integer types of that size,
it will assert.

Handle vector types in expand-large-divrem by
scalarizing them and then expanding the scalar type
operation. For large vectors, this results in a
*massive* code expansion, but it's better than
asserting.
---
 llvm/lib/CodeGen/ExpandLargeDivRem.cpp        |   44 +-
 llvm/test/CodeGen/AMDGPU/div_v2i128.ll        | 3233 ++++++++++++++++-
 .../ExpandLargeDivRem/X86/vector.ll           |  525 +++
 3 files changed, 3794 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/Transforms/ExpandLargeDivRem/X86/vector.ll

diff --git a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
index 973c814604b389..9a7cabacad7d2f 100644
--- a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
@@ -54,8 +54,34 @@ static bool isSigned(unsigned int Opcode) {
   return Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
 }
 
+static void scalarize(BinaryOperator *BO,
+                      SmallVectorImpl<BinaryOperator *> &Replace) {
+  VectorType *VTy = cast<VectorType>(BO->getType());
+  assert(!VTy->isScalableTy() && "Tried to scalarize scalable vector!");
+
+  IRBuilder<> Builder(BO);
+
+  unsigned NumElements = VTy->getElementCount().getKnownMinValue();
+  Value *Result = nullptr;
+  for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+    Value *LHS = Builder.CreateExtractElement(BO->getOperand(0), Idx);
+    Value *RHS = Builder.CreateExtractElement(BO->getOperand(1), Idx);
+    Value *Op = Builder.CreateBinOp(BO->getOpcode(), LHS, RHS);
+    Result = Builder.CreateInsertElement(
+        Result ? Result : PoisonValue::get(VTy), Op, Idx);
+    if (auto *NewBO = dyn_cast<BinaryOperator>(Op)) {
+      NewBO->copyIRFlags(BO, true); // Propagate e.g. 'exact' from the vector op.
+      Replace.push_back(NewBO);
+    }
+  }
+  BO->replaceAllUsesWith(Result);
+  BO->dropAllReferences();
+  BO->eraseFromParent();
+}
+
 static bool runImpl(Function &F, const TargetLowering &TLI) {
   SmallVector<BinaryOperator *, 4> Replace;
+  SmallVector<BinaryOperator *, 4> ReplaceVector;
   bool Modified = false;
 
   unsigned MaxLegalDivRemBitWidth = TLI.getMaxDivRemBitWidthSupported();
@@ -71,16 +97,23 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
     case Instruction::SDiv:
     case Instruction::URem:
     case Instruction::SRem: {
-      // TODO: This doesn't handle vectors.
-      auto *IntTy = dyn_cast<IntegerType>(I.getType());
+      // TODO: This pass doesn't handle scalable vectors.
+      if (I.getOperand(0)->getType()->isScalableTy())
+        continue;
+
+      auto *IntTy = dyn_cast<IntegerType>(I.getType()->getScalarType());
       if (!IntTy || IntTy->getIntegerBitWidth() <= MaxLegalDivRemBitWidth)
         continue;
 
       // The backend has peephole optimizations for powers of two.
+      // TODO: We don't consider vectors here.
       if (isConstantPowerOfTwo(I.getOperand(1), isSigned(I.getOpcode())))
         continue;
 
-      Replace.push_back(&cast<BinaryOperator>(I));
+      if (I.getOperand(0)->getType()->isVectorTy())
+        ReplaceVector.push_back(&cast<BinaryOperator>(I));
+      else
+        Replace.push_back(&cast<BinaryOperator>(I));
       Modified = true;
       break;
     }
@@ -89,6 +122,11 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
     }
   }
 
+  while (!ReplaceVector.empty()) {
+    BinaryOperator *BO = ReplaceVector.pop_back_val();
+    scalarize(BO, Replace);
+  }
+
   if (Replace.empty())
     return false;
 
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 46e2632e45a190..16a03badcb1329 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -1,25 +1,3248 @@
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
-
-; SDAG-ERR: LLVM ERROR: unsupported libcall legalization
-; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_v2i128_vv)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GISEL %s
 
 define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+; SDAG-LABEL: v_sdiv_v2i128_vv:
+; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_ashrrev_i32_e32 v24, 31, v3
+; SDAG-NEXT:    v_ashrrev_i32_e32 v25, 31, v11
+; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT:    v_mov_b32_e32 v26, v24
+; SDAG-NEXT:    v_mov_b32_e32 v27, v25
+; SDAG-NEXT:    v_xor_b32_e32 v17, v24, v3
+; SDAG-NEXT:    v_xor_b32_e32 v18, v24, v2
+; SDAG-NEXT:    v_xor_b32_e32 v1, v24, v1
+; SDAG-NEXT:    v_xor_b32_e32 v0, v24, v0
+; SDAG-NEXT:    v_xor_b32_e32 v19, v25, v11
+; SDAG-NEXT:    v_xor_b32_e32 v20, v25, v10
+; SDAG-NEXT:    v_xor_b32_e32 v9, v25, v9
+; SDAG-NEXT:    v_xor_b32_e32 v8, v25, v8
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v0, v24
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v1, v24, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v0, v2
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v18, v24, vcc
+; SDAG-NEXT:    v_add_i32_e64 v1, s[4:5], 32, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v3
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v17, v24, vcc
+; SDAG-NEXT:    v_or_b32_e32 v0, v2, v10
+; SDAG-NEXT:    v_ffbh_u32_e32 v17, v10
+; SDAG-NEXT:    v_min_u32_e32 v18, v1, v18
+; SDAG-NEXT:    v_sub_i32_e32 v28, vcc, v8, v25
+; SDAG-NEXT:    v_or_b32_e32 v1, v3, v11
+; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], 32, v17
+; SDAG-NEXT:    v_ffbh_u32_e32 v17, v11
+; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], 64, v18
+; SDAG-NEXT:    v_addc_u32_e64 v21, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v29, vcc, v9, v25, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v28
+; SDAG-NEXT:    v_min_u32_e32 v8, v8, v17
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v21, 0, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v20, v25, vcc
+; SDAG-NEXT:    v_add_i32_e64 v9, s[8:9], 32, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v29
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v8, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v19, v25, vcc
+; SDAG-NEXT:    v_or_b32_e32 v8, v28, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v19, v0
+; SDAG-NEXT:    v_min_u32_e32 v20, v9, v20
+; SDAG-NEXT:    v_or_b32_e32 v9, v29, v1
+; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 32, v19
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v1
+; SDAG-NEXT:    v_add_i32_e32 v20, vcc, 64, v20
+; SDAG-NEXT:    v_addc_u32_e64 v22, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_min_u32_e32 v8, v19, v21
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v22, 0, s[6:7]
+; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v20, v8, s[6:7]
+; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v8, v18
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v9, v17, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v17, 0x7f, v8
+; SDAG-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v16, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v16, vcc
+; SDAG-NEXT:    v_or_b32_e32 v16, v17, v18
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v9, v19
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_and_b32_e32 v16, 1, v20
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v11, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v10, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v3, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v2, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_6
+; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v8
+; SDAG-NEXT:    v_sub_i32_e64 v20, s[4:5], 63, v8
+; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[2:3], v20
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_or_b32_e32 v18, v30, v32
+; SDAG-NEXT:    v_sub_i32_e32 v34, vcc, 0x7f, v8
+; SDAG-NEXT:    v_or_b32_e32 v19, v31, v33
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[10:11], v34
+; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v34
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[2:3], v34
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_lshr_b64 v[18:19], v[2:3], v35
+; SDAG-NEXT:    v_or_b32_e32 v9, v9, v19
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v18
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v21, v9, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v20, v8, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, v23, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, v22, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v34
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_5
+; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT:    v_lshr_b64 v[16:17], v[2:3], v30
+; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v30
+; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
+; SDAG-NEXT:    v_lshr_b64 v[37:38], v[10:11], v30
+; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v28
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v22, 0
+; SDAG-NEXT:    v_mov_b32_e32 v23, 0
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    v_lshl_b64 v[48:49], v[10:11], v35
+; SDAG-NEXT:    v_lshr_b64 v[10:11], v[10:11], v36
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v29, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v17, v49
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v48
+; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v0, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v11, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v10, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v38, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v37, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v1, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
+; SDAG-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:  .LBB0_3: ; %udiv-do-while3
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v3
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v9
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v21
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v16
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v38
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v39
+; SDAG-NEXT:    v_or_b32_e32 v9, v19, v9
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v34, v2
+; SDAG-NEXT:    v_or_b32_e32 v8, v18, v8
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v35, v3, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v36, v10, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v37, v11, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v38, 31, v16
+; SDAG-NEXT:    v_and_b32_e32 v39, v38, v28
+; SDAG-NEXT:    v_and_b32_e32 v48, v38, v29
+; SDAG-NEXT:    v_and_b32_e32 v49, v38, v0
+; SDAG-NEXT:    v_and_b32_e32 v16, 1, v38
+; SDAG-NEXT:    v_and_b32_e32 v38, v38, v1
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v39
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v48, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v10, v49, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v11, v38, vcc
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; SDAG-NEXT:    v_or_b32_e32 v38, v30, v32
+; SDAG-NEXT:    v_or_b32_e32 v39, v31, v33
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT:    v_or_b32_e32 v21, v23, v21
+; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT:    v_or_b32_e32 v20, v22, v20
+; SDAG-NEXT:    v_mov_b32_e32 v23, v17
+; SDAG-NEXT:    v_mov_b32_e32 v22, v16
+; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT:    s_cbranch_execnz .LBB0_3
+; SDAG-NEXT:  ; %bb.4: ; %Flow13
+; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT:  .LBB0_5: ; %Flow14
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_lshl_b64 v[0:1], v[8:9], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v21
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[20:21], 1
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v8
+; SDAG-NEXT:    v_or_b32_e32 v20, v19, v1
+; SDAG-NEXT:    v_or_b32_e32 v21, v17, v3
+; SDAG-NEXT:    v_or_b32_e32 v17, v18, v0
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v2
+; SDAG-NEXT:  .LBB0_6: ; %Flow16
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_ashrrev_i32_e32 v18, 31, v7
+; SDAG-NEXT:    v_ashrrev_i32_e32 v19, 31, v15
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT:    v_mov_b32_e32 v22, v18
+; SDAG-NEXT:    v_mov_b32_e32 v23, v19
+; SDAG-NEXT:    v_xor_b32_e32 v0, v18, v7
+; SDAG-NEXT:    v_xor_b32_e32 v1, v18, v6
+; SDAG-NEXT:    v_xor_b32_e32 v3, v18, v5
+; SDAG-NEXT:    v_xor_b32_e32 v2, v18, v4
+; SDAG-NEXT:    v_xor_b32_e32 v6, v19, v15
+; SDAG-NEXT:    v_xor_b32_e32 v7, v19, v14
+; SDAG-NEXT:    v_xor_b32_e32 v8, v19, v13
+; SDAG-NEXT:    v_xor_b32_e32 v10, v19, v12
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v18
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v18, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v2
+; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v1, v18, vcc
+; SDAG-NEXT:    v_add_i32_e64 v1, s[4:5], 32, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v3
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v0, v18, vcc
+; SDAG-NEXT:    v_or_b32_e32 v0, v2, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v12, v4
+; SDAG-NEXT:    v_min_u32_e32 v11, v1, v11
+; SDAG-NEXT:    v_sub_i32_e32 v28, vcc, v10, v19
+; SDAG-NEXT:    v_or_b32_e32 v1, v3, v5
+; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], 32, v12
+; SDAG-NEXT:    v_ffbh_u32_e32 v12, v5
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], 64, v11
+; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v29, vcc, v8, v19, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v28
+; SDAG-NEXT:    v_min_u32_e32 v8, v10, v12
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v13, 0, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v7, v19, vcc
+; SDAG-NEXT:    v_add_i32_e64 v7, s[8:9], 32, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v12, v29
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v11, v8, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v6, v19, vcc
+; SDAG-NEXT:    v_or_b32_e32 v6, v28, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v0
+; SDAG-NEXT:    v_min_u32_e32 v12, v7, v12
+; SDAG-NEXT:    v_or_b32_e32 v7, v29, v1
+; SDAG-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
+; SDAG-NEXT:    v_ffbh_u32_e32 v13, v1
+; SDAG-NEXT:    v_add_i32_e32 v12, vcc, 64, v12
+; SDAG-NEXT:    v_addc_u32_e64 v14, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT:    v_min_u32_e32 v6, v11, v13
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, v14, 0, s[6:7]
+; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s[6:7]
+; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v10, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v10, 0x7f, v6
+; SDAG-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v8
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v11, v7, v9
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_and_b32_e32 v10, 1, v12
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v10
+; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, v5, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v4, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v3, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v2, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_12
+; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v6
+; SDAG-NEXT:    v_sub_i32_e64 v12, s[4:5], 63, v6
+; SDAG-NEXT:    v_mov_b32_e32 v10, 0
+; SDAG-NEXT:    v_mov_b32_e32 v11, 0
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v7, vcc
+; SDAG-NEXT:    v_lshl_b64 v[12:13], v[2:3], v12
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v8, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v7, v30, v32
+; SDAG-NEXT:    v_sub_i32_e32 v9, vcc, 0x7f, v6
+; SDAG-NEXT:    v_or_b32_e32 v8, v31, v33
+; SDAG-NEXT:    v_lshl_b64 v[14:15], v[4:5], v9
+; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, 64, v9
+; SDAG-NEXT:    v_lshl_b64 v[34:35], v[2:3], v9
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
+; SDAG-NEXT:    v_lshr_b64 v[6:7], v[2:3], v6
+; SDAG-NEXT:    v_or_b32_e32 v7, v15, v7
+; SDAG-NEXT:    v_or_b32_e32 v6, v14, v6
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v9
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v13, v7, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, v12, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, v35, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v34, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v8, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    v_mov_b32_e32 v13, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_11
+; SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT:    v_lshr_b64 v[10:11], v[2:3], v30
+; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v30
+; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
+; SDAG-NEXT:    v_lshr_b64 v[37:38], v[4:5], v30
+; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v28
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v14, 0
+; SDAG-NEXT:    v_mov_b32_e32 v15, 0
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    v_mov_b32_e32 v13, 0
+; SDAG-NEXT:    v_lshl_b64 v[48:49], v[4:5], v35
+; SDAG-NEXT:    v_lshr_b64 v[4:5], v[4:5], v36
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v29, vcc
+; SDAG-NEXT:    v_or_b32_e32 v11, v11, v49
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v48
+; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v0, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v5, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v4, v10, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, v38, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, v37, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v1, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
+; SDAG-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v11, 0
+; SDAG-NEXT:  .LBB0_9: ; %udiv-do-while
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 31, v3
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v9
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v7
+; SDAG-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; SDAG-NEXT:    v_or_b32_e32 v4, v4, v10
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v38
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v39
+; SDAG-NEXT:    v_or_b32_e32 v9, v13, v9
+; SDAG-NEXT:    v_or_b32_e32 v7, v15, v7
+; SDAG-NEXT:    v_or_b32_e32 v8, v12, v8
+; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v34, v2
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v35, v3, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v36, v4, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v37, v5, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v15, 31, v10
+; SDAG-NEXT:    v_and_b32_e32 v10, 1, v15
+; SDAG-NEXT:    v_and_b32_e32 v38, v15, v1
+; SDAG-NEXT:    v_and_b32_e32 v39, v15, v0
+; SDAG-NEXT:    v_and_b32_e32 v48, v15, v29
+; SDAG-NEXT:    v_and_b32_e32 v15, v15, v28
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v15
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v48, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v39, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v38, vcc
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; SDAG-NEXT:    v_or_b32_e32 v39, v31, v33
+; SDAG-NEXT:    v_or_b32_e32 v38, v30, v32
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT:    v_or_b32_e32 v6, v14, v6
+; SDAG-NEXT:    v_mov_b32_e32 v15, v11
+; SDAG-NEXT:    v_mov_b32_e32 v14, v10
+; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT:    s_cbranch_execnz .LBB0_9
+; SDAG-NEXT:  ; %bb.10: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT:  .LBB0_11: ; %Flow11
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_lshl_b64 v[0:1], v[8:9], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v4, 31, v7
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[6:7], 1
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT:    v_or_b32_e32 v13, v13, v1
+; SDAG-NEXT:    v_or_b32_e32 v14, v11, v3
+; SDAG-NEXT:    v_or_b32_e32 v11, v12, v0
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v2
+; SDAG-NEXT:  .LBB0_12: ; %Flow12
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_xor_b32_e32 v3, v27, v26
+; SDAG-NEXT:    v_xor_b32_e32 v2, v25, v24
+; SDAG-NEXT:    v_xor_b32_e32 v7, v23, v22
+; SDAG-NEXT:    v_xor_b32_e32 v6, v19, v18
+; SDAG-NEXT:    v_xor_b32_e32 v4, v20, v3
+; SDAG-NEXT:    v_xor_b32_e32 v5, v17, v2
+; SDAG-NEXT:    v_xor_b32_e32 v1, v21, v3
+; SDAG-NEXT:    v_xor_b32_e32 v0, v16, v2
+; SDAG-NEXT:    v_xor_b32_e32 v8, v13, v7
+; SDAG-NEXT:    v_xor_b32_e32 v9, v11, v6
+; SDAG-NEXT:    v_xor_b32_e32 v11, v14, v7
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v5, v2, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v4, v10, v6
+; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v11, v7, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v9, v6, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v8, v7, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sdiv_v2i128_vv:
+; GISEL:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GISEL-NEXT:    v_ashrrev_i32_e32 v24, 31, v3
+; GISEL-NEXT:    v_ashrrev_i32_e32 v25, 31, v11
+; GISEL-NEXT:    v_mov_b32_e32 v20, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v21, 0
+; GISEL-NEXT:    v_xor_b32_e32 v0, v24, v0
+; GISEL-NEXT:    v_xor_b32_e32 v1, v24, v1
+; GISEL-NEXT:    v_xor_b32_e32 v2, v24, v2
+; GISEL-NEXT:    v_xor_b32_e32 v3, v24, v3
+; GISEL-NEXT:    v_xor_b32_e32 v8, v25, v8
+; GISEL-NEXT:    v_xor_b32_e32 v9, v25, v9
+; GISEL-NEXT:    v_xor_b32_e32 v10, v25, v10
+; GISEL-NEXT:    v_xor_b32_e32 v11, v25, v11
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v0, v24
+; GISEL-NEXT:    v_subb_u32_e32 v17, vcc, v1, v24, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v26, s[4:5], v8, v25
+; GISEL-NEXT:    v_subb_u32_e64 v27, s[4:5], v9, v25, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v18, vcc, v2, v24, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v19, vcc, v3, v24, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v10, vcc, v10, v25, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, v11, v25, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v8, v27
+; GISEL-NEXT:    v_ffbh_u32_e32 v9, v26
+; GISEL-NEXT:    v_ffbh_u32_e32 v22, v17
+; GISEL-NEXT:    v_ffbh_u32_e32 v23, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v26, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v27, v11
+; GISEL-NEXT:    v_or_b32_e32 v2, v16, v18
+; GISEL-NEXT:    v_or_b32_e32 v3, v17, v19
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
+; GISEL-NEXT:    v_ffbh_u32_e32 v28, v11
+; GISEL-NEXT:    v_ffbh_u32_e32 v29, v10
+; GISEL-NEXT:    v_add_i32_e32 v23, vcc, 32, v23
+; GISEL-NEXT:    v_ffbh_u32_e32 v30, v19
+; GISEL-NEXT:    v_ffbh_u32_e32 v31, v18
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT:    v_min_u32_e32 v0, v8, v9
+; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v29
+; GISEL-NEXT:    v_min_u32_e32 v2, v22, v23
+; GISEL-NEXT:    v_add_i32_e64 v3, s[6:7], 32, v31
+; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 64, v0
+; GISEL-NEXT:    v_min_u32_e32 v1, v28, v1
+; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 64, v2
+; GISEL-NEXT:    v_min_u32_e32 v3, v30, v3
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[20:21]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v8, 0x7f, v0
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v8, v8, v2
+; GISEL-NEXT:    v_or_b32_e32 v9, v1, v3
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v9, v22, v20
+; GISEL-NEXT:    v_and_b32_e32 v20, 1, v9
+; GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, v16, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v22, 1, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, v17, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v18, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v19, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v22
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_6
+; GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT:    v_add_i32_e32 v28, vcc, 1, v0
+; GISEL-NEXT:    v_addc_u32_e64 v29, s[4:5], 0, v1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v32, vcc, 0x7f, v0
+; GISEL-NEXT:    v_addc_u32_e64 v30, vcc, 0, v2, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v31, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_subrev_i32_e64 v20, s[4:5], 64, v32
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], 64, v32
+; GISEL-NEXT:    v_lshl_b64 v[0:1], v[16:17], v32
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[18:19], v32
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    v_lshr_b64 v[8:9], v[16:17], v8
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[16:17], v20
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v32
+; GISEL-NEXT:    v_cndmask_b32_e32 v20, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v21, 0, v1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v8, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v9, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v32
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v0, v18, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v1, v19, vcc
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GISEL-NEXT:    v_mov_b32_e32 v2, s10
+; GISEL-NEXT:    v_mov_b32_e32 v3, s11
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_5
+; GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT:    v_subrev_i32_e32 v34, vcc, 64, v28
+; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, 64, v28
+; GISEL-NEXT:    v_lshr_b64 v[0:1], v[18:19], v28
+; GISEL-NEXT:    v_lshr_b64 v[2:3], v[16:17], v28
+; GISEL-NEXT:    v_add_i32_e32 v32, vcc, -1, v26
+; GISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v28
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v28
+; GISEL-NEXT:    v_addc_u32_e32 v33, vcc, -1, v27, vcc
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[18:19], v22
+; GISEL-NEXT:    v_lshr_b64 v[36:37], v[18:19], v34
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, v1, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v34, vcc, -1, v10, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v2, v22
+; GISEL-NEXT:    v_or_b32_e32 v1, v3, v23
+; GISEL-NEXT:    v_addc_u32_e32 v35, vcc, -1, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v36, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v37, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, v0, v16, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, v1, v17, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v17, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GISEL-NEXT:    v_mov_b32_e32 v2, s10
+; GISEL-NEXT:    v_mov_b32_e32 v3, s11
+; GISEL-NEXT:  .LBB0_3: ; %udiv-do-while3
+; GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v16, 31, v21
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT:    v_lshl_b64 v[36:37], v[22:23], 1
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[18:19], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 31, v23
+; GISEL-NEXT:    v_lshrrev_b32_e32 v23, 31, v9
+; GISEL-NEXT:    v_add_i32_e32 v28, vcc, -1, v28
+; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, -1, v29, vcc
+; GISEL-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GISEL-NEXT:    v_or_b32_e32 v20, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v21, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v2, v18, v22
+; GISEL-NEXT:    v_or_b32_e32 v3, v36, v23
+; GISEL-NEXT:    v_addc_u32_e32 v30, vcc, -1, v30, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; GISEL-NEXT:    v_or_b32_e32 v8, v8, v16
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v32, v3
+; GISEL-NEXT:    v_subb_u32_e32 v0, vcc, v33, v37, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v28, v30
+; GISEL-NEXT:    v_or_b32_e32 v1, v29, v31
+; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, v34, v2, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, v35, v19, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v16
+; GISEL-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GISEL-NEXT:    v_and_b32_e32 v1, v0, v26
+; GISEL-NEXT:    v_and_b32_e32 v18, v0, v27
+; GISEL-NEXT:    v_and_b32_e32 v16, 1, v0
+; GISEL-NEXT:    v_and_b32_e32 v36, v0, v10
+; GISEL-NEXT:    v_and_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, v3, v1
+; GISEL-NEXT:    v_subb_u32_e32 v23, vcc, v37, v18, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v18, vcc, v2, v36, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v19, vcc, v19, v0, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v0, v16
+; GISEL-NEXT:    v_mov_b32_e32 v1, v17
+; GISEL-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    s_cbranch_execnz .LBB0_3
+; GISEL-NEXT:  ; %bb.4: ; %Flow13
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:  .LBB0_5: ; %Flow14
+; GISEL-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 31, v21
+; GISEL-NEXT:    v_or_b32_e32 v8, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v20, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v21, v1, v3
+; GISEL-NEXT:  .LBB0_6: ; %Flow16
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GISEL-NEXT:    v_ashrrev_i32_e32 v18, 31, v7
+; GISEL-NEXT:    v_ashrrev_i32_e32 v19, 31, v15
+; GISEL-NEXT:    v_mov_b32_e32 v10, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v11, 0
+; GISEL-NEXT:    v_xor_b32_e32 v0, v18, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v18, v5
+; GISEL-NEXT:    v_xor_b32_e32 v2, v18, v6
+; GISEL-NEXT:    v_xor_b32_e32 v3, v18, v7
+; GISEL-NEXT:    v_xor_b32_e32 v4, v19, v12
+; GISEL-NEXT:    v_xor_b32_e32 v5, v19, v13
+; GISEL-NEXT:    v_xor_b32_e32 v14, v19, v14
+; GISEL-NEXT:    v_xor_b32_e32 v15, v19, v15
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v0, v18
+; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v1, v18, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v22, s[4:5], v4, v19
+; GISEL-NEXT:    v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, v2, v18, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v13, vcc, v3, v18, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v4, vcc, v14, v19, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v15, v19, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v14, v23
+; GISEL-NEXT:    v_ffbh_u32_e32 v15, v22
+; GISEL-NEXT:    v_ffbh_u32_e32 v16, v7
+; GISEL-NEXT:    v_ffbh_u32_e32 v17, v6
+; GISEL-NEXT:    v_or_b32_e32 v0, v22, v4
+; GISEL-NEXT:    v_or_b32_e32 v1, v23, v5
+; GISEL-NEXT:    v_or_b32_e32 v2, v6, v12
+; GISEL-NEXT:    v_or_b32_e32 v3, v7, v13
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, 32, v15
+; GISEL-NEXT:    v_ffbh_u32_e32 v26, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v27, v4
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, 32, v17
+; GISEL-NEXT:    v_ffbh_u32_e32 v28, v13
+; GISEL-NEXT:    v_ffbh_u32_e32 v29, v12
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT:    v_min_u32_e32 v0, v14, v15
+; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v27
+; GISEL-NEXT:    v_min_u32_e32 v2, v16, v17
+; GISEL-NEXT:    v_add_i32_e64 v3, s[6:7], 32, v29
+; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 64, v0
+; GISEL-NEXT:    v_min_u32_e32 v1, v26, v1
+; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 64, v2
+; GISEL-NEXT:    v_min_u32_e32 v3, v28, v3
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v10, 0x7f, v0
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v10, v10, v2
+; GISEL-NEXT:    v_or_b32_e32 v11, v1, v3
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v11, v14, v15
+; GISEL-NEXT:    v_and_b32_e32 v14, 1, v11
+; GISEL-NEXT:    v_or_b32_e32 v10, v11, v10
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, v6, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v16, 1, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, v7, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v12, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v13, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_12
+; GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT:    v_add_i32_e32 v26, vcc, 1, v0
+; GISEL-NEXT:    v_addc_u32_e64 v27, s[4:5], 0, v1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v30, vcc, 0x7f, v0
+; GISEL-NEXT:    v_addc_u32_e64 v28, vcc, 0, v2, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_subrev_i32_e64 v14, s[4:5], 64, v30
+; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], 64, v30
+; GISEL-NEXT:    v_lshl_b64 v[0:1], v[6:7], v30
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[12:13], v30
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    v_lshr_b64 v[10:11], v[6:7], v10
+; GISEL-NEXT:    v_lshl_b64 v[16:17], v[6:7], v14
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v30
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, 0, v1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v10, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v11, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v0, v12, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v1, v13, vcc
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GISEL-NEXT:    v_mov_b32_e32 v2, s10
+; GISEL-NEXT:    v_mov_b32_e32 v3, s11
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_11
+; GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT:    v_subrev_i32_e32 v32, vcc, 64, v26
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, 64, v26
+; GISEL-NEXT:    v_lshr_b64 v[0:1], v[12:13], v26
+; GISEL-NEXT:    v_lshr_b64 v[2:3], v[6:7], v26
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_add_i32_e32 v30, vcc, -1, v22
+; GISEL-NEXT:    v_addc_u32_e32 v31, vcc, -1, v23, vcc
+; GISEL-NEXT:    v_lshl_b64 v[16:17], v[12:13], v16
+; GISEL-NEXT:    v_lshr_b64 v[12:13], v[12:13], v32
+; GISEL-NEXT:    v_addc_u32_e32 v32, vcc, -1, v4, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v33, vcc, -1, v5, vcc
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v16
+; GISEL-NEXT:    v_or_b32_e32 v3, v3, v17
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v17, 0, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v2, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v3, v7, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:  .LBB0_9: ; %udiv-do-while
+; GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[12:13], 1
+; GISEL-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 31, v13
+; GISEL-NEXT:    v_lshrrev_b32_e32 v34, 31, v11
+; GISEL-NEXT:    v_lshl_b64 v[12:13], v[14:15], 1
+; GISEL-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 31, v15
+; GISEL-NEXT:    v_add_i32_e32 v26, vcc, -1, v26
+; GISEL-NEXT:    v_addc_u32_e32 v27, vcc, -1, v27, vcc
+; GISEL-NEXT:    v_or_b32_e32 v16, v16, v6
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v34
+; GISEL-NEXT:    v_or_b32_e32 v10, v10, v14
+; GISEL-NEXT:    v_or_b32_e32 v14, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v15, v1, v13
+; GISEL-NEXT:    v_addc_u32_e32 v28, vcc, -1, v28, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, -1, v29, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v30, v2
+; GISEL-NEXT:    v_subb_u32_e32 v0, vcc, v31, v3, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v26, v28
+; GISEL-NEXT:    v_or_b32_e32 v1, v27, v29
+; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v32, v16, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v33, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v6
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v6, 1, v0
+; GISEL-NEXT:    v_and_b32_e32 v12, v0, v22
+; GISEL-NEXT:    v_and_b32_e32 v13, v0, v23
+; GISEL-NEXT:    v_and_b32_e32 v34, v0, v4
+; GISEL-NEXT:    v_and_b32_e32 v35, v0, v5
+; GISEL-NEXT:    v_mov_b32_e32 v0, v6
+; GISEL-NEXT:    v_mov_b32_e32 v1, v7
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v2, v12
+; GISEL-NEXT:    v_subb_u32_e32 v13, vcc, v3, v13, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, v16, v34, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v17, vcc, v17, v35, vcc
+; GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execnz .LBB0_9
+; GISEL-NEXT:  ; %bb.10: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB0_11: ; %Flow11
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[14:15], 1
+; GISEL-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 31, v15
+; GISEL-NEXT:    v_or_b32_e32 v10, v10, v4
+; GISEL-NEXT:    v_or_b32_e32 v14, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v15, v1, v3
+; GISEL-NEXT:  .LBB0_12: ; %Flow12
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    v_xor_b32_e32 v3, v25, v24
+; GISEL-NEXT:    v_xor_b32_e32 v7, v19, v18
+; GISEL-NEXT:    v_xor_b32_e32 v0, v20, v3
+; GISEL-NEXT:    v_xor_b32_e32 v1, v21, v3
+; GISEL-NEXT:    v_xor_b32_e32 v2, v8, v3
+; GISEL-NEXT:    v_xor_b32_e32 v6, v9, v3
+; GISEL-NEXT:    v_xor_b32_e32 v4, v14, v7
+; GISEL-NEXT:    v_xor_b32_e32 v5, v15, v7
+; GISEL-NEXT:    v_xor_b32_e32 v8, v10, v7
+; GISEL-NEXT:    v_xor_b32_e32 v9, v11, v7
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], v4, v7
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v5, v7, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v3, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v6, v3, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v8, v7, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v9, v7, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %shl = sdiv <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
 
 define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+; SDAG-LABEL: v_udiv_v2i128_vv:
+; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v17, v9, v11
+; SDAG-NEXT:    v_or_b32_e32 v16, v8, v10
+; SDAG-NEXT:    v_or_b32_e32 v19, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v18, v0, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v10
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v11
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v8
+; SDAG-NEXT:    v_ffbh_u32_e32 v23, v9
+; SDAG-NEXT:    v_ffbh_u32_e32 v24, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v25, v3
+; SDAG-NEXT:    v_ffbh_u32_e32 v26, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v27, v1
+; SDAG-NEXT:    v_mov_b32_e32 v28, 0
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v20
+; SDAG-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v22
+; SDAG-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v24
+; SDAG-NEXT:    v_add_i32_e64 v19, s[6:7], 32, v26
+; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT:    v_min_u32_e32 v16, v16, v21
+; SDAG-NEXT:    v_min_u32_e32 v17, v17, v23
+; SDAG-NEXT:    v_min_u32_e32 v18, v18, v25
+; SDAG-NEXT:    v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT:    v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT:    v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 64, v19
+; SDAG-NEXT:    v_addc_u32_e64 v21, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v20, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v21, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v23, vcc, v16, v18
+; SDAG-NEXT:    v_subb_u32_e32 v24, vcc, v20, v17, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v23
+; SDAG-NEXT:    v_subbrev_u32_e32 v25, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[23:24]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v26, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v25
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[25:26]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v24, v26
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[25:26]
+; SDAG-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_and_b32_e32 v16, 1, v18
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v3, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v2, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, v0, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_6
+; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT:    v_add_i32_e32 v18, vcc, 1, v23
+; SDAG-NEXT:    v_sub_i32_e64 v16, s[4:5], 63, v23
+; SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; SDAG-NEXT:    v_mov_b32_e32 v22, 0
+; SDAG-NEXT:    v_addc_u32_e32 v27, vcc, 0, v24, vcc
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[0:1], v16
+; SDAG-NEXT:    v_addc_u32_e32 v28, vcc, 0, v25, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v29, vcc, 0, v26, vcc
+; SDAG-NEXT:    v_or_b32_e32 v19, v18, v28
+; SDAG-NEXT:    v_sub_i32_e32 v30, vcc, 0x7f, v23
+; SDAG-NEXT:    v_or_b32_e32 v20, v27, v29
+; SDAG-NEXT:    v_lshl_b64 v[23:24], v[2:3], v30
+; SDAG-NEXT:    v_sub_i32_e32 v31, vcc, 64, v30
+; SDAG-NEXT:    v_lshl_b64 v[25:26], v[0:1], v30
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[19:20]
+; SDAG-NEXT:    v_lshr_b64 v[19:20], v[0:1], v31
+; SDAG-NEXT:    v_or_b32_e32 v20, v24, v20
+; SDAG-NEXT:    v_or_b32_e32 v19, v23, v19
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v17, v20, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v16, v19, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v24, 0, v26, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v23, 0, v25, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v30
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v17, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v16, v2, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_5
+; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT:    v_lshr_b64 v[21:22], v[0:1], v18
+; SDAG-NEXT:    v_sub_i32_e32 v31, vcc, 64, v18
+; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v18
+; SDAG-NEXT:    v_lshr_b64 v[32:33], v[2:3], v18
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v8
+; SDAG-NEXT:    s_mov_b64 s[12:13], 0
+; SDAG-NEXT:    v_mov_b32_e32 v25, 0
+; SDAG-NEXT:    v_mov_b32_e32 v26, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v18
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v18
+; SDAG-NEXT:    v_lshl_b64 v[34:35], v[2:3], v31
+; SDAG-NEXT:    v_lshr_b64 v[36:37], v[2:3], v36
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, -1, v9, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v33, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v32, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v22, v22, v35
+; SDAG-NEXT:    v_or_b32_e32 v21, v21, v34
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, -1, v10, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v22, v37, v22, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, -1, v11, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v22, v1, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v21, v0, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v22, 0
+; SDAG-NEXT:  .LBB1_3: ; %udiv-do-while3
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v21, 31, v24
+; SDAG-NEXT:    v_lshl_b64 v[23:24], v[23:24], 1
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v34, 31, v1
+; SDAG-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v35, 31, v17
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT:    v_or_b32_e32 v24, v26, v24
+; SDAG-NEXT:    v_or_b32_e32 v23, v25, v23
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v34
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v35
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v21
+; SDAG-NEXT:    v_sub_i32_e32 v21, vcc, v30, v0
+; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, v31, v1, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, v32, v2, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, v33, v3, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v21, 31, v21
+; SDAG-NEXT:    v_and_b32_e32 v25, v21, v8
+; SDAG-NEXT:    v_and_b32_e32 v26, v21, v9
+; SDAG-NEXT:    v_and_b32_e32 v34, v21, v10
+; SDAG-NEXT:    v_and_b32_e32 v35, v21, v11
+; SDAG-NEXT:    v_and_b32_e32 v21, 1, v21
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v25
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v26, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v2, v34, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v35, vcc
+; SDAG-NEXT:    v_add_i32_e32 v18, vcc, -1, v18
+; SDAG-NEXT:    v_addc_u32_e32 v27, vcc, -1, v27, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v28, vcc, -1, v28, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v29, vcc, -1, v29, vcc
+; SDAG-NEXT:    v_or_b32_e32 v25, v18, v28
+; SDAG-NEXT:    v_or_b32_e32 v26, v27, v29
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[25:26]
+; SDAG-NEXT:    v_or_b32_e32 v17, v20, v17
+; SDAG-NEXT:    s_or_b64 s[12:13], vcc, s[12:13]
+; SDAG-NEXT:    v_or_b32_e32 v16, v19, v16
+; SDAG-NEXT:    v_mov_b32_e32 v26, v22
+; SDAG-NEXT:    v_mov_b32_e32 v25, v21
+; SDAG-NEXT:    s_andn2_b64 exec, exec, s[12:13]
+; SDAG-NEXT:    s_cbranch_execnz .LBB1_3
+; SDAG-NEXT:  ; %bb.4: ; %Flow13
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB1_5: ; %Flow14
+; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT:    v_lshl_b64 v[0:1], v[16:17], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v24
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[23:24], 1
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v8
+; SDAG-NEXT:    v_or_b32_e32 v16, v20, v1
+; SDAG-NEXT:    v_or_b32_e32 v18, v22, v3
+; SDAG-NEXT:    v_or_b32_e32 v17, v19, v0
+; SDAG-NEXT:    v_or_b32_e32 v19, v21, v2
+; SDAG-NEXT:  .LBB1_6: ; %Flow16
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_or_b32_e32 v1, v13, v15
+; SDAG-NEXT:    v_or_b32_e32 v0, v12, v14
+; SDAG-NEXT:    v_or_b32_e32 v3, v5, v7
+; SDAG-NEXT:    v_or_b32_e32 v2, v4, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v8, v14
+; SDAG-NEXT:    v_ffbh_u32_e32 v9, v15
+; SDAG-NEXT:    v_ffbh_u32_e32 v10, v12
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v13
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v7
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, 0
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT:    v_add_i32_e64 v0, s[6:7], 32, v8
+; SDAG-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v10
+; SDAG-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v20
+; SDAG-NEXT:    v_add_i32_e64 v3, s[6:7], 32, v22
+; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT:    v_min_u32_e32 v0, v0, v9
+; SDAG-NEXT:    v_min_u32_e32 v1, v1, v11
+; SDAG-NEXT:    v_min_u32_e32 v2, v2, v21
+; SDAG-NEXT:    v_min_u32_e32 v3, v3, v23
+; SDAG-NEXT:    v_add_i32_e32 v1, vcc, 64, v1
+; SDAG-NEXT:    v_addc_u32_e64 v8, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_add_i32_e32 v3, vcc, 64, v3
+; SDAG-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v9, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v8, v1, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v8, 0x7f, v0
+; SDAG-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v24, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v24, vcc
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v9, v1, v3
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v8, 1, v10
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v8
+; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v7, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v6, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v5, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v4, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_12
+; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
+; SDAG-NEXT:    v_sub_i32_e64 v9, s[4:5], 63, v0
+; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; SDAG-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_lshl_b64 v[9:10], v[4:5], v9
+; SDAG-NEXT:    v_addc_u32_e32 v24, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v25, vcc, 0, v3, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v8, v24
+; SDAG-NEXT:    v_sub_i32_e32 v3, vcc, 0x7f, v0
+; SDAG-NEXT:    v_or_b32_e32 v2, v11, v25
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[6:7], v3
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, 64, v3
+; SDAG-NEXT:    v_lshl_b64 v[26:27], v[4:5], v3
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[1:2]
+; SDAG-NEXT:    v_lshr_b64 v[0:1], v[4:5], v0
+; SDAG-NEXT:    v_or_b32_e32 v1, v23, v1
+; SDAG-NEXT:    v_or_b32_e32 v0, v22, v0
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v3
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v10, v1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v27, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v26, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v2, v7, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v9, v6, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; SDAG-NEXT:    v_mov_b32_e32 v10, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_11
+; SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT:    v_lshr_b64 v[20:21], v[4:5], v8
+; SDAG-NEXT:    v_sub_i32_e32 v27, vcc, 64, v8
+; SDAG-NEXT:    v_subrev_i32_e32 v28, vcc, 64, v8
+; SDAG-NEXT:    v_lshr_b64 v[29:30], v[6:7], v8
+; SDAG-NEXT:    v_add_i32_e32 v26, vcc, -1, v12
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v22, 0
+; SDAG-NEXT:    v_mov_b32_e32 v23, 0
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; SDAG-NEXT:    v_mov_b32_e32 v10, 0
+; SDAG-NEXT:    v_lshl_b64 v[31:32], v[6:7], v27
+; SDAG-NEXT:    v_lshr_b64 v[6:7], v[6:7], v28
+; SDAG-NEXT:    v_addc_u32_e32 v27, vcc, -1, v13, vcc
+; SDAG-NEXT:    v_or_b32_e32 v21, v21, v32
+; SDAG-NEXT:    v_or_b32_e32 v20, v20, v31
+; SDAG-NEXT:    v_addc_u32_e32 v28, vcc, -1, v14, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v8
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v7, v21, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v6, v20, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, v30, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v29, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v29, vcc, -1, v15, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; SDAG-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; SDAG-NEXT:  .LBB1_9: ; %udiv-do-while
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v20, 31, v5
+; SDAG-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v30, 31, v3
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v31, 31, v1
+; SDAG-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT:    v_or_b32_e32 v6, v6, v20
+; SDAG-NEXT:    v_or_b32_e32 v4, v4, v30
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v31
+; SDAG-NEXT:    v_or_b32_e32 v3, v10, v3
+; SDAG-NEXT:    v_or_b32_e32 v1, v23, v1
+; SDAG-NEXT:    v_or_b32_e32 v2, v9, v2
+; SDAG-NEXT:    v_sub_i32_e32 v20, vcc, v26, v4
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, v27, v5, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, v28, v6, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, v29, v7, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v23, 31, v20
+; SDAG-NEXT:    v_and_b32_e32 v20, 1, v23
+; SDAG-NEXT:    v_and_b32_e32 v30, v23, v15
+; SDAG-NEXT:    v_and_b32_e32 v31, v23, v14
+; SDAG-NEXT:    v_and_b32_e32 v32, v23, v13
+; SDAG-NEXT:    v_and_b32_e32 v23, v23, v12
+; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v23
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v32, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v6, v31, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v30, vcc
+; SDAG-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
+; SDAG-NEXT:    v_addc_u32_e32 v11, vcc, -1, v11, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v24, vcc, -1, v24, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v25, vcc, -1, v25, vcc
+; SDAG-NEXT:    v_or_b32_e32 v31, v11, v25
+; SDAG-NEXT:    v_or_b32_e32 v30, v8, v24
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[30:31]
+; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT:    v_or_b32_e32 v0, v22, v0
+; SDAG-NEXT:    v_mov_b32_e32 v23, v21
+; SDAG-NEXT:    v_mov_b32_e32 v22, v20
+; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT:    s_cbranch_execnz .LBB1_9
+; SDAG-NEXT:  ; %bb.10: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT:  .LBB1_11: ; %Flow11
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
+; SDAG-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v4
+; SDAG-NEXT:    v_or_b32_e32 v8, v10, v3
+; SDAG-NEXT:    v_or_b32_e32 v10, v21, v1
+; SDAG-NEXT:    v_or_b32_e32 v9, v9, v2
+; SDAG-NEXT:    v_or_b32_e32 v11, v20, v0
+; SDAG-NEXT:  .LBB1_12: ; %Flow12
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v19
+; SDAG-NEXT:    v_mov_b32_e32 v1, v18
+; SDAG-NEXT:    v_mov_b32_e32 v2, v17
+; SDAG-NEXT:    v_mov_b32_e32 v3, v16
+; SDAG-NEXT:    v_mov_b32_e32 v4, v11
+; SDAG-NEXT:    v_mov_b32_e32 v5, v10
+; SDAG-NEXT:    v_mov_b32_e32 v6, v9
+; SDAG-NEXT:    v_mov_b32_e32 v7, v8
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_udiv_v2i128_vv:
+; GISEL:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v16, v2
+; GISEL-NEXT:    v_mov_b32_e32 v17, v3
+; GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GISEL-NEXT:    v_or_b32_e32 v2, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v3, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v18, v0, v16
+; GISEL-NEXT:    v_or_b32_e32 v19, v1, v17
+; GISEL-NEXT:    v_ffbh_u32_e32 v20, v9
+; GISEL-NEXT:    v_ffbh_u32_e32 v21, v8
+; GISEL-NEXT:    v_ffbh_u32_e32 v22, v11
+; GISEL-NEXT:    v_ffbh_u32_e32 v23, v10
+; GISEL-NEXT:    v_ffbh_u32_e32 v26, v1
+; GISEL-NEXT:    v_ffbh_u32_e32 v27, v0
+; GISEL-NEXT:    v_ffbh_u32_e32 v28, v17
+; GISEL-NEXT:    v_ffbh_u32_e32 v29, v16
+; GISEL-NEXT:    v_mov_b32_e32 v24, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v25, 0
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v21
+; GISEL-NEXT:    v_add_i32_e64 v3, s[6:7], 32, v23
+; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v27
+; GISEL-NEXT:    v_add_i32_e64 v19, s[6:7], 32, v29
+; GISEL-NEXT:    v_min_u32_e32 v2, v20, v2
+; GISEL-NEXT:    v_min_u32_e32 v3, v22, v3
+; GISEL-NEXT:    v_min_u32_e32 v18, v26, v18
+; GISEL-NEXT:    v_min_u32_e32 v19, v28, v19
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 64, v2
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, 64, v18
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v19, v18, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v20, vcc, v2, v3
+; GISEL-NEXT:    v_subb_u32_e64 v21, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[20:21], v[24:25]
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v20
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v22
+; GISEL-NEXT:    v_or_b32_e32 v3, v21, v23
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v26, v18
+; GISEL-NEXT:    v_and_b32_e32 v18, 1, v3
+; GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v24, 1, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, v1, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v16, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v17, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v24
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_6
+; GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT:    v_add_i32_e32 v26, vcc, 1, v20
+; GISEL-NEXT:    v_addc_u32_e64 v27, s[4:5], 0, v21, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v30, vcc, 0x7f, v20
+; GISEL-NEXT:    v_addc_u32_e64 v28, vcc, 0, v22, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v23, vcc
+; GISEL-NEXT:    v_subrev_i32_e64 v22, s[4:5], 64, v30
+; GISEL-NEXT:    v_sub_i32_e64 v20, s[4:5], 64, v30
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[0:1], v30
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[16:17], v30
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    v_lshr_b64 v[20:21], v[0:1], v20
+; GISEL-NEXT:    v_lshl_b64 v[24:25], v[0:1], v22
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v30
+; GISEL-NEXT:    v_cndmask_b32_e32 v22, 0, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v23, 0, v3, vcc
+; GISEL-NEXT:    v_or_b32_e32 v2, v20, v18
+; GISEL-NEXT:    v_or_b32_e32 v3, v21, v19
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v24, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v25, v3, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v17, vcc
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v21, s11
+; GISEL-NEXT:    v_mov_b32_e32 v20, s10
+; GISEL-NEXT:    v_mov_b32_e32 v19, s9
+; GISEL-NEXT:    v_mov_b32_e32 v18, s8
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_5
+; GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT:    v_subrev_i32_e32 v32, vcc, 64, v26
+; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 64, v26
+; GISEL-NEXT:    v_lshr_b64 v[18:19], v[16:17], v26
+; GISEL-NEXT:    v_lshr_b64 v[20:21], v[0:1], v26
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_add_i32_e32 v30, vcc, -1, v8
+; GISEL-NEXT:    v_addc_u32_e32 v31, vcc, -1, v9, vcc
+; GISEL-NEXT:    v_lshl_b64 v[24:25], v[16:17], v24
+; GISEL-NEXT:    v_lshr_b64 v[16:17], v[16:17], v32
+; GISEL-NEXT:    v_addc_u32_e32 v32, vcc, -1, v10, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v33, vcc, -1, v11, vcc
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_or_b32_e32 v20, v20, v24
+; GISEL-NEXT:    v_or_b32_e32 v21, v21, v25
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT:    v_cndmask_b32_e32 v20, v16, v20, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v21, v17, v21, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, 0, v18, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v17, 0, v19, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT:    v_cndmask_b32_e32 v24, v20, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v25, v21, v1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_mov_b32_e32 v21, s7
+; GISEL-NEXT:    v_mov_b32_e32 v20, s6
+; GISEL-NEXT:    v_mov_b32_e32 v19, s5
+; GISEL-NEXT:    v_mov_b32_e32 v18, s4
+; GISEL-NEXT:  .LBB1_3: ; %udiv-do-while3
+; GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v34, 31, v23
+; GISEL-NEXT:    v_lshl_b64 v[20:21], v[22:23], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 31, v25
+; GISEL-NEXT:    v_lshl_b64 v[24:25], v[24:25], 1
+; GISEL-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v35, 31, v3
+; GISEL-NEXT:    v_add_i32_e32 v26, vcc, -1, v26
+; GISEL-NEXT:    v_addc_u32_e32 v27, vcc, -1, v27, vcc
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GISEL-NEXT:    v_or_b32_e32 v22, v18, v20
+; GISEL-NEXT:    v_or_b32_e32 v23, v19, v21
+; GISEL-NEXT:    v_or_b32_e32 v16, v16, v0
+; GISEL-NEXT:    v_or_b32_e32 v20, v24, v35
+; GISEL-NEXT:    v_addc_u32_e32 v28, vcc, -1, v28, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, -1, v29, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v30, v20
+; GISEL-NEXT:    v_subb_u32_e32 v0, vcc, v31, v25, vcc
+; GISEL-NEXT:    v_or_b32_e32 v18, v26, v28
+; GISEL-NEXT:    v_or_b32_e32 v19, v27, v29
+; GISEL-NEXT:    v_subb_u32_e32 v0, vcc, v32, v16, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v0, vcc, v33, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v18, v0, v8
+; GISEL-NEXT:    v_and_b32_e32 v19, v0, v9
+; GISEL-NEXT:    v_and_b32_e32 v21, v0, v10
+; GISEL-NEXT:    v_and_b32_e32 v35, v0, v11
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, v20, v18
+; GISEL-NEXT:    v_subb_u32_e32 v25, vcc, v25, v19, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, v16, v21, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v17, vcc, v17, v35, vcc
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v34
+; GISEL-NEXT:    v_mov_b32_e32 v19, v1
+; GISEL-NEXT:    v_mov_b32_e32 v18, v0
+; GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execnz .LBB1_3
+; GISEL-NEXT:  ; %bb.4: ; %Flow13
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB1_5: ; %Flow14
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_b64 v[0:1], v[22:23], 1
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v8, 31, v23
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v8
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v0
+; GISEL-NEXT:    v_or_b32_e32 v19, v19, v1
+; GISEL-NEXT:  .LBB1_6: ; %Flow16
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GISEL-NEXT:    v_or_b32_e32 v0, v12, v14
+; GISEL-NEXT:    v_or_b32_e32 v1, v13, v15
+; GISEL-NEXT:    v_or_b32_e32 v8, v4, v6
+; GISEL-NEXT:    v_or_b32_e32 v9, v5, v7
+; GISEL-NEXT:    v_ffbh_u32_e32 v16, v13
+; GISEL-NEXT:    v_ffbh_u32_e32 v17, v12
+; GISEL-NEXT:    v_ffbh_u32_e32 v20, v15
+; GISEL-NEXT:    v_ffbh_u32_e32 v21, v14
+; GISEL-NEXT:    v_ffbh_u32_e32 v22, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v23, v4
+; GISEL-NEXT:    v_ffbh_u32_e32 v24, v7
+; GISEL-NEXT:    v_ffbh_u32_e32 v25, v6
+; GISEL-NEXT:    v_mov_b32_e32 v10, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v11, 0
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 32, v17
+; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v21
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], 32, v23
+; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], 32, v25
+; GISEL-NEXT:    v_min_u32_e32 v0, v16, v0
+; GISEL-NEXT:    v_min_u32_e32 v1, v20, v1
+; GISEL-NEXT:    v_min_u32_e32 v8, v22, v8
+; GISEL-NEXT:    v_min_u32_e32 v9, v24, v9
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 64, v8
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v8, 0x7f, v0
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v8, v8, v16
+; GISEL-NEXT:    v_or_b32_e32 v9, v1, v17
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v9, v20, v10
+; GISEL-NEXT:    v_and_b32_e32 v10, 1, v9
+; GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v4, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v20, 1, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v5, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v6, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v7, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_12
+; GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
+; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, v1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v26, vcc, 0x7f, v0
+; GISEL-NEXT:    v_addc_u32_e64 v24, vcc, 0, v16, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v25, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_subrev_i32_e64 v9, s[4:5], 64, v26
+; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], 64, v26
+; GISEL-NEXT:    v_lshl_b64 v[0:1], v[4:5], v26
+; GISEL-NEXT:    v_lshl_b64 v[16:17], v[6:7], v26
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    v_lshr_b64 v[20:21], v[4:5], v10
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[4:5], v9
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v20, v16
+; GISEL-NEXT:    v_or_b32_e32 v1, v21, v17
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v23, s11
+; GISEL-NEXT:    v_mov_b32_e32 v22, s10
+; GISEL-NEXT:    v_mov_b32_e32 v21, s9
+; GISEL-NEXT:    v_mov_b32_e32 v20, s8
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_11
+; GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT:    v_subrev_i32_e32 v28, vcc, 64, v8
+; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, 64, v8
+; GISEL-NEXT:    v_lshr_b64 v[16:17], v[6:7], v8
+; GISEL-NEXT:    v_lshr_b64 v[20:21], v[4:5], v8
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_add_i32_e32 v26, vcc, -1, v12
+; GISEL-NEXT:    v_addc_u32_e32 v27, vcc, -1, v13, vcc
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[6:7], v22
+; GISEL-NEXT:    v_lshr_b64 v[6:7], v[6:7], v28
+; GISEL-NEXT:    v_addc_u32_e32 v28, vcc, -1, v14, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v29, vcc, -1, v15, vcc
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_or_b32_e32 v20, v20, v22
+; GISEL-NEXT:    v_or_b32_e32 v21, v21, v23
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v20, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v21, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v5, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    v_mov_b32_e32 v23, s7
+; GISEL-NEXT:    v_mov_b32_e32 v22, s6
+; GISEL-NEXT:    v_mov_b32_e32 v21, s5
+; GISEL-NEXT:    v_mov_b32_e32 v20, s4
+; GISEL-NEXT:  .LBB1_9: ; %udiv-do-while
+; GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[6:7], 1
+; GISEL-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 31, v7
+; GISEL-NEXT:    v_lshrrev_b32_e32 v30, 31, v1
+; GISEL-NEXT:    v_lshl_b64 v[6:7], v[9:10], 1
+; GISEL-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 31, v10
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
+; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, -1, v11, vcc
+; GISEL-NEXT:    v_or_b32_e32 v16, v16, v4
+; GISEL-NEXT:    v_or_b32_e32 v22, v22, v30
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_or_b32_e32 v9, v20, v6
+; GISEL-NEXT:    v_or_b32_e32 v10, v21, v7
+; GISEL-NEXT:    v_addc_u32_e32 v24, vcc, -1, v24, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v25, vcc, -1, v25, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v26, v22
+; GISEL-NEXT:    v_subb_u32_e32 v4, vcc, v27, v23, vcc
+; GISEL-NEXT:    v_or_b32_e32 v6, v8, v24
+; GISEL-NEXT:    v_or_b32_e32 v7, v11, v25
+; GISEL-NEXT:    v_subb_u32_e32 v4, vcc, v28, v16, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v4, vcc, v29, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v4
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v4, 1, v6
+; GISEL-NEXT:    v_and_b32_e32 v7, v6, v12
+; GISEL-NEXT:    v_and_b32_e32 v30, v6, v13
+; GISEL-NEXT:    v_and_b32_e32 v31, v6, v14
+; GISEL-NEXT:    v_and_b32_e32 v32, v6, v15
+; GISEL-NEXT:    v_mov_b32_e32 v21, v5
+; GISEL-NEXT:    v_mov_b32_e32 v20, v4
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v22, v7
+; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v23, v30, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, v16, v31, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v17, vcc, v17, v32, vcc
+; GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execnz .LBB1_9
+; GISEL-NEXT:  ; %bb.10: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB1_11: ; %Flow11
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_b64 v[4:5], v[9:10], 1
+; GISEL-NEXT:    v_lshl_b64 v[8:9], v[0:1], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 31, v10
+; GISEL-NEXT:    v_or_b32_e32 v8, v8, v0
+; GISEL-NEXT:    v_or_b32_e32 v10, v20, v4
+; GISEL-NEXT:    v_or_b32_e32 v11, v21, v5
+; GISEL-NEXT:  .LBB1_12: ; %Flow12
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v18
+; GISEL-NEXT:    v_mov_b32_e32 v1, v19
+; GISEL-NEXT:    v_mov_b32_e32 v4, v10
+; GISEL-NEXT:    v_mov_b32_e32 v5, v11
+; GISEL-NEXT:    v_mov_b32_e32 v6, v8
+; GISEL-NEXT:    v_mov_b32_e32 v7, v9
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %shl = udiv <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
 
 define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+; SDAG-LABEL: v_srem_v2i128_vv:
+; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SDAG-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
+; SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v11
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT:    v_mov_b32_e32 v29, v28
+; SDAG-NEXT:    v_xor_b32_e32 v18, v3, v28
+; SDAG-NEXT:    v_xor_b32_e32 v19, v2, v28
+; SDAG-NEXT:    v_xor_b32_e32 v1, v1, v28
+; SDAG-NEXT:    v_xor_b32_e32 v0, v0, v28
+; SDAG-NEXT:    v_xor_b32_e32 v11, v11, v16
+; SDAG-NEXT:    v_xor_b32_e32 v10, v10, v16
+; SDAG-NEXT:    v_xor_b32_e32 v20, v9, v16
+; SDAG-NEXT:    v_xor_b32_e32 v9, v8, v16
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v0, v28
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v1, v28, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v2
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v19, v28, vcc
+; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 32, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v3
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v18, v28, vcc
+; SDAG-NEXT:    v_or_b32_e32 v8, v2, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v0
+; SDAG-NEXT:    v_min_u32_e32 v19, v19, v21
+; SDAG-NEXT:    v_sub_i32_e32 v31, vcc, v9, v16
+; SDAG-NEXT:    v_or_b32_e32 v9, v3, v1
+; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], 32, v18
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v1
+; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 64, v19
+; SDAG-NEXT:    v_addc_u32_e64 v22, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v30, vcc, v20, v16, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; SDAG-NEXT:    v_ffbh_u32_e32 v9, v31
+; SDAG-NEXT:    v_min_u32_e32 v18, v18, v21
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v22, 0, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v10, v16, vcc
+; SDAG-NEXT:    v_add_i32_e64 v21, s[8:9], 32, v9
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v30
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v19, v18, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v11, v16, vcc
+; SDAG-NEXT:    v_or_b32_e32 v10, v31, v8
+; SDAG-NEXT:    v_ffbh_u32_e32 v16, v8
+; SDAG-NEXT:    v_min_u32_e32 v19, v21, v22
+; SDAG-NEXT:    v_or_b32_e32 v11, v30, v9
+; SDAG-NEXT:    v_add_i32_e32 v16, vcc, 32, v16
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v9
+; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 64, v19
+; SDAG-NEXT:    v_addc_u32_e64 v22, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_min_u32_e32 v10, v16, v21
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[8:9]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v22, 0, s[6:7]
+; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v19, v10, s[6:7]
+; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v10, v18
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v11, v20, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v10
+; SDAG-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v18
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v11, v19
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_and_b32_e32 v16, 1, v20
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v35, v1, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v32, v0, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v27, v3, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v33, v2, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_6
+; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT:    v_add_i32_e32 v32, vcc, 1, v10
+; SDAG-NEXT:    v_sub_i32_e64 v20, s[4:5], 63, v10
+; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v11, vcc
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[2:3], v20
+; SDAG-NEXT:    v_addc_u32_e32 v34, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_or_b32_e32 v18, v32, v34
+; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v10
+; SDAG-NEXT:    v_or_b32_e32 v19, v33, v35
+; SDAG-NEXT:    v_lshl_b64 v[10:11], v[0:1], v24
+; SDAG-NEXT:    v_sub_i32_e32 v25, vcc, 64, v24
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[2:3], v24
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_lshr_b64 v[18:19], v[2:3], v25
+; SDAG-NEXT:    v_or_b32_e32 v11, v11, v19
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v18
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v21, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v20, v10, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, v23, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, v22, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v11, v1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v10, v0, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_5
+; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT:    v_lshr_b64 v[16:17], v[2:3], v32
+; SDAG-NEXT:    v_sub_i32_e32 v26, vcc, 64, v32
+; SDAG-NEXT:    v_subrev_i32_e32 v37, vcc, 64, v32
+; SDAG-NEXT:    v_lshr_b64 v[24:25], v[0:1], v32
+; SDAG-NEXT:    v_add_i32_e32 v36, vcc, -1, v31
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v22, 0
+; SDAG-NEXT:    v_mov_b32_e32 v23, 0
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    v_lshl_b64 v[26:27], v[0:1], v26
+; SDAG-NEXT:    v_lshr_b64 v[48:49], v[0:1], v37
+; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v30, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v17, v27
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v26
+; SDAG-NEXT:    v_addc_u32_e32 v38, vcc, -1, v8, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v32
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v49, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v48, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v27, 0, v25, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v26, 0, v24, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, -1, v9, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v32
+; SDAG-NEXT:    v_cndmask_b32_e32 v25, v17, v3, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v24, v16, v2, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:  .LBB2_3: ; %udiv-do-while3
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshl_b64 v[26:27], v[26:27], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v25
+; SDAG-NEXT:    v_lshl_b64 v[24:25], v[24:25], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v48, 31, v11
+; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v49, 31, v21
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT:    v_or_b32_e32 v26, v26, v16
+; SDAG-NEXT:    v_or_b32_e32 v24, v24, v48
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v49
+; SDAG-NEXT:    v_or_b32_e32 v11, v19, v11
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v36, v24
+; SDAG-NEXT:    v_or_b32_e32 v10, v18, v10
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v37, v25, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v38, v26, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v39, v27, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
+; SDAG-NEXT:    v_and_b32_e32 v48, v16, v31
+; SDAG-NEXT:    v_and_b32_e32 v49, v16, v30
+; SDAG-NEXT:    v_and_b32_e32 v50, v16, v8
+; SDAG-NEXT:    v_and_b32_e32 v51, v16, v9
+; SDAG-NEXT:    v_and_b32_e32 v16, 1, v16
+; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, v24, v48
+; SDAG-NEXT:    v_subb_u32_e32 v25, vcc, v25, v49, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v26, vcc, v26, v50, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v27, vcc, v27, v51, vcc
+; SDAG-NEXT:    v_add_i32_e32 v32, vcc, -1, v32
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v34, vcc, -1, v34, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v35, vcc
+; SDAG-NEXT:    v_or_b32_e32 v48, v32, v34
+; SDAG-NEXT:    v_or_b32_e32 v49, v33, v35
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[48:49]
+; SDAG-NEXT:    v_or_b32_e32 v21, v23, v21
+; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT:    v_or_b32_e32 v20, v22, v20
+; SDAG-NEXT:    v_mov_b32_e32 v23, v17
+; SDAG-NEXT:    v_mov_b32_e32 v22, v16
+; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT:    s_cbranch_execnz .LBB2_3
+; SDAG-NEXT:  ; %bb.4: ; %Flow13
+; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT:  .LBB2_5: ; %Flow14
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v22, 31, v21
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v22
+; SDAG-NEXT:    v_or_b32_e32 v35, v19, v11
+; SDAG-NEXT:    v_or_b32_e32 v27, v17, v21
+; SDAG-NEXT:    v_or_b32_e32 v32, v18, v10
+; SDAG-NEXT:    v_or_b32_e32 v33, v16, v20
+; SDAG-NEXT:  .LBB2_6: ; %Flow16
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_ashrrev_i32_e32 v26, 31, v7
+; SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT:    v_mov_b32_e32 v34, v26
+; SDAG-NEXT:    v_xor_b32_e32 v10, v7, v26
+; SDAG-NEXT:    v_xor_b32_e32 v11, v6, v26
+; SDAG-NEXT:    v_xor_b32_e32 v5, v5, v26
+; SDAG-NEXT:    v_xor_b32_e32 v4, v4, v26
+; SDAG-NEXT:    v_xor_b32_e32 v15, v15, v16
+; SDAG-NEXT:    v_xor_b32_e32 v14, v14, v16
+; SDAG-NEXT:    v_xor_b32_e32 v13, v13, v16
+; SDAG-NEXT:    v_xor_b32_e32 v12, v12, v16
+; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v4, v26
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v5, v26, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v6
+; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v11, v26, vcc
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], 32, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v7
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v10, v26, vcc
+; SDAG-NEXT:    v_or_b32_e32 v10, v6, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v19, v4
+; SDAG-NEXT:    v_min_u32_e32 v18, v11, v18
+; SDAG-NEXT:    v_sub_i32_e32 v37, vcc, v12, v16
+; SDAG-NEXT:    v_or_b32_e32 v11, v7, v5
+; SDAG-NEXT:    v_add_i32_e64 v12, s[4:5], 32, v19
+; SDAG-NEXT:    v_ffbh_u32_e32 v19, v5
+; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], 64, v18
+; SDAG-NEXT:    v_addc_u32_e64 v20, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v36, vcc, v13, v16, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v37
+; SDAG-NEXT:    v_min_u32_e32 v12, v12, v19
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, v20, 0, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v14, v16, vcc
+; SDAG-NEXT:    v_add_i32_e64 v13, s[8:9], 32, v11
+; SDAG-NEXT:    v_ffbh_u32_e32 v14, v36
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v12, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v15, v16, vcc
+; SDAG-NEXT:    v_or_b32_e32 v12, v37, v10
+; SDAG-NEXT:    v_ffbh_u32_e32 v15, v10
+; SDAG-NEXT:    v_min_u32_e32 v14, v13, v14
+; SDAG-NEXT:    v_or_b32_e32 v13, v36, v11
+; SDAG-NEXT:    v_add_i32_e32 v15, vcc, 32, v15
+; SDAG-NEXT:    v_ffbh_u32_e32 v16, v11
+; SDAG-NEXT:    v_add_i32_e32 v14, vcc, 64, v14
+; SDAG-NEXT:    v_addc_u32_e64 v20, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; SDAG-NEXT:    v_min_u32_e32 v12, v15, v16
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, v20, 0, s[6:7]
+; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, v14, v12, s[6:7]
+; SDAG-NEXT:    v_sub_i32_e32 v12, vcc, v12, v18
+; SDAG-NEXT:    v_subb_u32_e32 v13, vcc, v13, v19, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v12
+; SDAG-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v14
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v13, v15
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_and_b32_e32 v16, 1, v18
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, v5, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v4, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v7, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v6, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_12
+; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT:    v_add_i32_e32 v38, vcc, 1, v12
+; SDAG-NEXT:    v_sub_i32_e64 v18, s[4:5], 63, v12
+; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, 0, v13, vcc
+; SDAG-NEXT:    v_lshl_b64 v[18:19], v[6:7], v18
+; SDAG-NEXT:    v_addc_u32_e32 v48, vcc, 0, v14, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v49, vcc, 0, v15, vcc
+; SDAG-NEXT:    v_or_b32_e32 v13, v38, v48
+; SDAG-NEXT:    v_sub_i32_e32 v15, vcc, 0x7f, v12
+; SDAG-NEXT:    v_or_b32_e32 v14, v39, v49
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[4:5], v15
+; SDAG-NEXT:    v_sub_i32_e32 v12, vcc, 64, v15
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[6:7], v15
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[13:14]
+; SDAG-NEXT:    v_lshr_b64 v[12:13], v[6:7], v12
+; SDAG-NEXT:    v_or_b32_e32 v13, v21, v13
+; SDAG-NEXT:    v_or_b32_e32 v12, v20, v12
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v15
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v19, v13, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v12, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, 0, v23, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v22, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, v14, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v18, v4, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_11
+; SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT:    v_lshr_b64 v[16:17], v[6:7], v38
+; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, 64, v38
+; SDAG-NEXT:    v_subrev_i32_e32 v51, vcc, 64, v38
+; SDAG-NEXT:    v_lshr_b64 v[22:23], v[4:5], v38
+; SDAG-NEXT:    v_add_i32_e32 v50, vcc, -1, v37
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    v_lshl_b64 v[24:25], v[4:5], v24
+; SDAG-NEXT:    v_lshr_b64 v[53:54], v[4:5], v51
+; SDAG-NEXT:    v_addc_u32_e32 v51, vcc, -1, v36, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v17, v25
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v24
+; SDAG-NEXT:    v_addc_u32_e32 v52, vcc, -1, v10, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v38
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v54, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v53, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v25, 0, v23, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v24, 0, v22, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v53, vcc, -1, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v38
+; SDAG-NEXT:    v_cndmask_b32_e32 v23, v17, v7, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v22, v16, v6, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:  .LBB2_9: ; %udiv-do-while
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshl_b64 v[24:25], v[24:25], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v23
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v54, 31, v15
+; SDAG-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v55, 31, v13
+; SDAG-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT:    v_or_b32_e32 v24, v24, v16
+; SDAG-NEXT:    v_or_b32_e32 v22, v22, v54
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v55
+; SDAG-NEXT:    v_or_b32_e32 v15, v19, v15
+; SDAG-NEXT:    v_or_b32_e32 v13, v21, v13
+; SDAG-NEXT:    v_or_b32_e32 v14, v18, v14
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v50, v22
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v51, v23, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v52, v24, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v53, v25, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v21, 31, v16
+; SDAG-NEXT:    v_and_b32_e32 v16, 1, v21
+; SDAG-NEXT:    v_and_b32_e32 v54, v21, v11
+; SDAG-NEXT:    v_and_b32_e32 v55, v21, v10
+; SDAG-NEXT:    v_and_b32_e32 v40, v21, v36
+; SDAG-NEXT:    v_and_b32_e32 v21, v21, v37
+; SDAG-NEXT:    v_sub_i32_e32 v22, vcc, v22, v21
+; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, v23, v40, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v24, vcc, v24, v55, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v25, vcc, v25, v54, vcc
+; SDAG-NEXT:    v_add_i32_e32 v38, vcc, -1, v38
+; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, -1, v39, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v48, vcc, -1, v48, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v49, vcc, -1, v49, vcc
+; SDAG-NEXT:    v_or_b32_e32 v55, v39, v49
+; SDAG-NEXT:    v_or_b32_e32 v54, v38, v48
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[54:55]
+; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT:    v_or_b32_e32 v12, v20, v12
+; SDAG-NEXT:    v_mov_b32_e32 v21, v17
+; SDAG-NEXT:    v_mov_b32_e32 v20, v16
+; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT:    s_cbranch_execnz .LBB2_9
+; SDAG-NEXT:  ; %bb.10: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT:  .LBB2_11: ; %Flow11
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v20, 31, v13
+; SDAG-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v20
+; SDAG-NEXT:    v_or_b32_e32 v19, v19, v15
+; SDAG-NEXT:    v_or_b32_e32 v17, v17, v13
+; SDAG-NEXT:    v_or_b32_e32 v18, v18, v14
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v12
+; SDAG-NEXT:  .LBB2_12: ; %Flow12
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mul_lo_u32 v14, v33, v9
+; SDAG-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v33, v8, 0
+; SDAG-NEXT:    v_mul_lo_u32 v24, v27, v8
+; SDAG-NEXT:    v_mul_lo_u32 v25, v35, v31
+; SDAG-NEXT:    v_mul_lo_u32 v35, v32, v30
+; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v31, v33, 0
+; SDAG-NEXT:    v_mov_b32_e32 v15, 0
+; SDAG-NEXT:    v_mul_lo_u32 v38, v16, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v10, 0
+; SDAG-NEXT:    v_mul_lo_u32 v39, v17, v10
+; SDAG-NEXT:    v_mul_lo_u32 v19, v19, v37
+; SDAG-NEXT:    v_mul_lo_u32 v48, v18, v36
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v37, v16, 0
+; SDAG-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; SDAG-NEXT:    v_mov_b32_e32 v14, v9
+; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15]
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
+; SDAG-NEXT:    v_add_i32_e64 v14, s[4:5], v21, v38
+; SDAG-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v24
+; SDAG-NEXT:    v_mov_b32_e32 v24, v23
+; SDAG-NEXT:    v_mov_b32_e32 v23, v15
+; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v31, v27, v[22:23]
+; SDAG-NEXT:    v_xor_b32_e32 v33, v2, v28
+; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v14, v39
+; SDAG-NEXT:    v_mov_b32_e32 v14, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v36, v16, v[14:15]
+; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v32, v31, v[12:13]
+; SDAG-NEXT:    v_mov_b32_e32 v2, v9
+; SDAG-NEXT:    v_add_i32_e64 v13, s[4:5], v24, v2
+; SDAG-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v2, v8
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v3, v2, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21]
+; SDAG-NEXT:    v_mov_b32_e32 v18, v23
+; SDAG-NEXT:    v_mov_b32_e32 v23, v15
+; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v37, v17, v[22:23]
+; SDAG-NEXT:    v_add_i32_e64 v20, s[4:5], v25, v12
+; SDAG-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v30, v27, v[13:14]
+; SDAG-NEXT:    v_xor_b32_e32 v16, v16, v29
+; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v19, v3
+; SDAG-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v9
+; SDAG-NEXT:    v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v18, v8
+; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], v35, v20
+; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v48, v3
+; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v36, v17, v[14:15]
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; SDAG-NEXT:    v_addc_u32_e64 v12, s[4:5], v13, v19, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v0, v11, vcc
+; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v2
+; SDAG-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v3, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v2, v0, v28
+; SDAG-NEXT:    v_xor_b32_e32 v3, v1, v29
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v33, v28
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v16, v29, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v2, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v29, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v6, v10
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v18, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v6, v6, v26
+; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v8, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v7, v7, v34
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v9, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v8, v4, v26
+; SDAG-NEXT:    v_xor_b32_e32 v9, v5, v34
+; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v6, v26
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v7, v34, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v8, v26, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v9, v34, vcc
+; SDAG-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_srem_v2i128_vv:
+; GISEL:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GISEL-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
+; GISEL-NEXT:    v_ashrrev_i32_e32 v20, 31, v11
+; GISEL-NEXT:    v_mov_b32_e32 v18, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v19, 0
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v28
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v28
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v28
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v28
+; GISEL-NEXT:    v_xor_b32_e32 v8, v8, v20
+; GISEL-NEXT:    v_xor_b32_e32 v9, v9, v20
+; GISEL-NEXT:    v_xor_b32_e32 v10, v10, v20
+; GISEL-NEXT:    v_xor_b32_e32 v11, v11, v20
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v0, v28
+; GISEL-NEXT:    v_subb_u32_e32 v17, vcc, v1, v28, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v30, s[4:5], v8, v20
+; GISEL-NEXT:    v_subb_u32_e64 v29, s[4:5], v9, v20, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, v2, v28, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, v3, v28, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v10, vcc, v10, v20, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, v11, v20, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v20, v29
+; GISEL-NEXT:    v_ffbh_u32_e32 v21, v30
+; GISEL-NEXT:    v_ffbh_u32_e32 v22, v17
+; GISEL-NEXT:    v_ffbh_u32_e32 v23, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v30, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v29, v11
+; GISEL-NEXT:    v_or_b32_e32 v2, v16, v8
+; GISEL-NEXT:    v_or_b32_e32 v3, v17, v9
+; GISEL-NEXT:    v_add_i32_e32 v21, vcc, 32, v21
+; GISEL-NEXT:    v_ffbh_u32_e32 v24, v11
+; GISEL-NEXT:    v_ffbh_u32_e32 v25, v10
+; GISEL-NEXT:    v_add_i32_e32 v23, vcc, 32, v23
+; GISEL-NEXT:    v_ffbh_u32_e32 v26, v9
+; GISEL-NEXT:    v_ffbh_u32_e32 v27, v8
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT:    v_min_u32_e32 v0, v20, v21
+; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v25
+; GISEL-NEXT:    v_min_u32_e32 v2, v22, v23
+; GISEL-NEXT:    v_add_i32_e64 v3, s[6:7], 32, v27
+; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 64, v0
+; GISEL-NEXT:    v_min_u32_e32 v1, v24, v1
+; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 64, v2
+; GISEL-NEXT:    v_min_u32_e32 v3, v26, v3
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[18:19]
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v18, 0x7f, v0
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v2
+; GISEL-NEXT:    v_or_b32_e32 v19, v1, v3
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v19, v20, v21
+; GISEL-NEXT:    v_and_b32_e32 v20, 1, v19
+; GISEL-NEXT:    v_or_b32_e32 v18, v19, v18
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v31, v16, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v20, 1, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v32, v17, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, v8, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, v9, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT:    v_add_i32_e32 v31, vcc, 1, v0
+; GISEL-NEXT:    v_addc_u32_e64 v32, s[4:5], 0, v1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v0
+; GISEL-NEXT:    v_addc_u32_e64 v33, vcc, 0, v2, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v34, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_subrev_i32_e64 v20, s[4:5], 64, v24
+; GISEL-NEXT:    v_sub_i32_e64 v18, s[4:5], 64, v24
+; GISEL-NEXT:    v_lshl_b64 v[0:1], v[16:17], v24
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[8:9], v24
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    v_lshr_b64 v[18:19], v[16:17], v18
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[16:17], v20
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GISEL-NEXT:    v_cndmask_b32_e32 v20, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v21, 0, v1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v18, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v19, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, v0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v19, v1, v9, vcc
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GISEL-NEXT:    v_mov_b32_e32 v2, s10
+; GISEL-NEXT:    v_mov_b32_e32 v3, s11
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT:    v_subrev_i32_e32 v24, vcc, 64, v31
+; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, 64, v31
+; GISEL-NEXT:    v_lshr_b64 v[0:1], v[8:9], v31
+; GISEL-NEXT:    v_lshr_b64 v[2:3], v[16:17], v31
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_add_i32_e32 v35, vcc, -1, v30
+; GISEL-NEXT:    v_addc_u32_e32 v36, vcc, -1, v29, vcc
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[8:9], v22
+; GISEL-NEXT:    v_lshr_b64 v[24:25], v[8:9], v24
+; GISEL-NEXT:    v_addc_u32_e32 v37, vcc, -1, v10, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v38, vcc, -1, v11, vcc
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v22
+; GISEL-NEXT:    v_or_b32_e32 v3, v3, v23
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v31
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v24, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v25, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v26, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v27, 0, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
+; GISEL-NEXT:    v_cndmask_b32_e32 v24, v2, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v25, v3, v17, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v23, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:  .LBB2_3: ; %udiv-do-while3
+; GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 31, v21
+; GISEL-NEXT:    v_lshl_b64 v[48:49], v[24:25], 1
+; GISEL-NEXT:    v_lshl_b64 v[26:27], v[26:27], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v24, 31, v25
+; GISEL-NEXT:    v_lshrrev_b32_e32 v25, 31, v19
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[18:19], 1
+; GISEL-NEXT:    v_add_i32_e32 v31, vcc, -1, v31
+; GISEL-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; GISEL-NEXT:    v_or_b32_e32 v20, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v21, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v2, v26, v24
+; GISEL-NEXT:    v_or_b32_e32 v3, v48, v25
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v22
+; GISEL-NEXT:    v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v34, vcc, -1, v34, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v35, v3
+; GISEL-NEXT:    v_subb_u32_e32 v0, vcc, v36, v49, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v31, v33
+; GISEL-NEXT:    v_or_b32_e32 v1, v32, v34
+; GISEL-NEXT:    v_subb_u32_e32 v22, vcc, v37, v2, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v22, vcc, v38, v27, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v22
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v22, 1, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, v0, v30
+; GISEL-NEXT:    v_and_b32_e32 v25, v0, v29
+; GISEL-NEXT:    v_and_b32_e32 v26, v0, v10
+; GISEL-NEXT:    v_and_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, v3, v1
+; GISEL-NEXT:    v_subb_u32_e32 v25, vcc, v49, v25, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v26, vcc, v2, v26, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v27, vcc, v27, v0, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v0, v22
+; GISEL-NEXT:    v_mov_b32_e32 v1, v23
+; GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execnz .LBB2_3
+; GISEL-NEXT:  ; %bb.4: ; %Flow13
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB2_5: ; %Flow14
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[18:19], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v20, 31, v21
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v20
+; GISEL-NEXT:    v_or_b32_e32 v31, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v32, v1, v3
+; GISEL-NEXT:  .LBB2_6: ; %Flow16
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GISEL-NEXT:    v_ashrrev_i32_e32 v33, 31, v7
+; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v15
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_xor_b32_e32 v1, v4, v33
+; GISEL-NEXT:    v_xor_b32_e32 v4, v5, v33
+; GISEL-NEXT:    v_xor_b32_e32 v5, v6, v33
+; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v33
+; GISEL-NEXT:    v_xor_b32_e32 v6, v12, v0
+; GISEL-NEXT:    v_xor_b32_e32 v20, v13, v0
+; GISEL-NEXT:    v_xor_b32_e32 v14, v14, v0
+; GISEL-NEXT:    v_xor_b32_e32 v15, v15, v0
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v1, v33
+; GISEL-NEXT:    v_subb_u32_e32 v13, vcc, v4, v33, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v35, s[4:5], v6, v0
+; GISEL-NEXT:    v_subb_u32_e64 v34, s[4:5], v20, v0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v5, v33, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v7, v33, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v4, vcc, v14, v0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v15, v0, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v20, v34
+; GISEL-NEXT:    v_ffbh_u32_e32 v21, v35
+; GISEL-NEXT:    v_ffbh_u32_e32 v22, v13
+; GISEL-NEXT:    v_ffbh_u32_e32 v23, v12
+; GISEL-NEXT:    v_or_b32_e32 v0, v35, v4
+; GISEL-NEXT:    v_or_b32_e32 v1, v34, v5
+; GISEL-NEXT:    v_or_b32_e32 v14, v12, v6
+; GISEL-NEXT:    v_or_b32_e32 v15, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v21, vcc, 32, v21
+; GISEL-NEXT:    v_ffbh_u32_e32 v24, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v25, v4
+; GISEL-NEXT:    v_add_i32_e32 v23, vcc, 32, v23
+; GISEL-NEXT:    v_ffbh_u32_e32 v26, v7
+; GISEL-NEXT:    v_ffbh_u32_e32 v27, v6
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GISEL-NEXT:    v_min_u32_e32 v0, v20, v21
+; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v25
+; GISEL-NEXT:    v_min_u32_e32 v14, v22, v23
+; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], 32, v27
+; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 64, v0
+; GISEL-NEXT:    v_min_u32_e32 v1, v24, v1
+; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], 64, v14
+; GISEL-NEXT:    v_min_u32_e32 v15, v26, v15
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v15, v14, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v15, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v0
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v14
+; GISEL-NEXT:    v_or_b32_e32 v3, v1, v15
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v20, v21
+; GISEL-NEXT:    v_and_b32_e32 v20, 1, v3
+; GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, v12, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v22, 1, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, v13, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v22
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_12
+; GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT:    v_add_i32_e32 v36, vcc, 1, v0
+; GISEL-NEXT:    v_addc_u32_e64 v37, s[4:5], 0, v1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v0
+; GISEL-NEXT:    v_addc_u32_e64 v38, vcc, 0, v14, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v39, vcc, 0, v15, vcc
+; GISEL-NEXT:    v_subrev_i32_e64 v20, s[4:5], 64, v24
+; GISEL-NEXT:    v_sub_i32_e64 v14, s[4:5], 64, v24
+; GISEL-NEXT:    v_lshl_b64 v[0:1], v[12:13], v24
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[6:7], v24
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    v_lshr_b64 v[14:15], v[12:13], v14
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[12:13], v20
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GISEL-NEXT:    v_cndmask_b32_e32 v20, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v21, 0, v1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v14, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v15, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, v0, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, v1, v7, vcc
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GISEL-NEXT:    v_mov_b32_e32 v2, s10
+; GISEL-NEXT:    v_mov_b32_e32 v3, s11
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_11
+; GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT:    v_subrev_i32_e32 v24, vcc, 64, v36
+; GISEL-NEXT:    v_sub_i32_e32 v22, vcc, 64, v36
+; GISEL-NEXT:    v_lshr_b64 v[0:1], v[6:7], v36
+; GISEL-NEXT:    v_lshr_b64 v[2:3], v[12:13], v36
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_add_i32_e32 v48, vcc, -1, v35
+; GISEL-NEXT:    v_addc_u32_e32 v49, vcc, -1, v34, vcc
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[6:7], v22
+; GISEL-NEXT:    v_lshr_b64 v[24:25], v[6:7], v24
+; GISEL-NEXT:    v_addc_u32_e32 v50, vcc, -1, v4, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v51, vcc, -1, v5, vcc
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v22
+; GISEL-NEXT:    v_or_b32_e32 v3, v3, v23
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v36
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v24, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v25, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v26, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v27, 0, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v36
+; GISEL-NEXT:    v_cndmask_b32_e32 v24, v2, v12, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v25, v3, v13, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v23, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:  .LBB2_9: ; %udiv-do-while
+; GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 31, v21
+; GISEL-NEXT:    v_lshl_b64 v[52:53], v[24:25], 1
+; GISEL-NEXT:    v_lshl_b64 v[26:27], v[26:27], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v24, 31, v25
+; GISEL-NEXT:    v_lshrrev_b32_e32 v25, 31, v15
+; GISEL-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
+; GISEL-NEXT:    v_add_i32_e32 v36, vcc, -1, v36
+; GISEL-NEXT:    v_addc_u32_e32 v37, vcc, -1, v37, vcc
+; GISEL-NEXT:    v_or_b32_e32 v20, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v21, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v2, v26, v24
+; GISEL-NEXT:    v_or_b32_e32 v3, v52, v25
+; GISEL-NEXT:    v_or_b32_e32 v14, v14, v22
+; GISEL-NEXT:    v_addc_u32_e32 v38, vcc, -1, v38, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v39, vcc, -1, v39, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v48, v3
+; GISEL-NEXT:    v_subb_u32_e32 v0, vcc, v49, v53, vcc
+; GISEL-NEXT:    v_or_b32_e32 v0, v36, v38
+; GISEL-NEXT:    v_or_b32_e32 v1, v37, v39
+; GISEL-NEXT:    v_subb_u32_e32 v22, vcc, v50, v2, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v22, vcc, v51, v27, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v22
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v22, 1, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, v0, v35
+; GISEL-NEXT:    v_and_b32_e32 v25, v0, v34
+; GISEL-NEXT:    v_and_b32_e32 v26, v0, v4
+; GISEL-NEXT:    v_and_b32_e32 v52, v0, v5
+; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, v3, v1
+; GISEL-NEXT:    v_subb_u32_e32 v25, vcc, v53, v25, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v0, v22
+; GISEL-NEXT:    v_mov_b32_e32 v1, v23
+; GISEL-NEXT:    v_subb_u32_e32 v26, vcc, v2, v26, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v27, vcc, v27, v52, vcc
+; GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execnz .LBB2_9
+; GISEL-NEXT:  ; %bb.10: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB2_11: ; %Flow11
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[20:21], 1
+; GISEL-NEXT:    v_lshl_b64 v[2:3], v[14:15], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 31, v21
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v14
+; GISEL-NEXT:    v_or_b32_e32 v20, v0, v22
+; GISEL-NEXT:    v_or_b32_e32 v21, v1, v23
+; GISEL-NEXT:  .LBB2_12: ; %Flow12
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0
+; GISEL-NEXT:    v_mul_lo_u32 v24, v30, v19
+; GISEL-NEXT:    v_mul_lo_u32 v25, v29, v18
+; GISEL-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0
+; GISEL-NEXT:    v_mul_lo_u32 v26, v35, v3
+; GISEL-NEXT:    v_mul_lo_u32 v27, v34, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15]
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23]
+; GISEL-NEXT:    v_mov_b32_e32 v22, v19
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3]
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15]
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2]
+; GISEL-NEXT:    v_mov_b32_e32 v23, v14
+; GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23]
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2]
+; GISEL-NEXT:    v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23]
+; GISEL-NEXT:    v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7]
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v25, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v15, v0, v28
+; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v18
+; GISEL-NEXT:    v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5]
+; GISEL-NEXT:    v_xor_b32_e32 v16, v12, v33
+; GISEL-NEXT:    v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v28
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1]
+; GISEL-NEXT:    v_xor_b32_e32 v14, v14, v33
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13]
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v15, v28
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4]
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[8:9], v16, v33
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9]
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, v9, v10, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v28
+; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v6, v23, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v6, v6, v33
+; GISEL-NEXT:    v_xor_b32_e32 v7, v8, v28
+; GISEL-NEXT:    v_xor_b32_e32 v8, v3, v33
+; GISEL-NEXT:    v_subb_u32_e64 v2, vcc, v2, v28, s[6:7]
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v7, v28, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v6, v33, s[8:9]
+; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v8, v33, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %shl = srem <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
 
 define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+; SDAG-LABEL: v_urem_v2i128_vv:
+; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v17, v9, v11
+; SDAG-NEXT:    v_or_b32_e32 v16, v8, v10
+; SDAG-NEXT:    v_or_b32_e32 v19, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v18, v0, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v10
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v11
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v8
+; SDAG-NEXT:    v_ffbh_u32_e32 v23, v9
+; SDAG-NEXT:    v_ffbh_u32_e32 v24, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v25, v3
+; SDAG-NEXT:    v_ffbh_u32_e32 v26, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v27, v1
+; SDAG-NEXT:    v_mov_b32_e32 v28, 0
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v20
+; SDAG-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v22
+; SDAG-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v24
+; SDAG-NEXT:    v_add_i32_e64 v19, s[6:7], 32, v26
+; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT:    v_min_u32_e32 v16, v16, v21
+; SDAG-NEXT:    v_min_u32_e32 v17, v17, v23
+; SDAG-NEXT:    v_min_u32_e32 v18, v18, v25
+; SDAG-NEXT:    v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT:    v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT:    v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 64, v19
+; SDAG-NEXT:    v_addc_u32_e64 v21, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v20, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v21, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v16, v18
+; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, v20, v17, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v18, 0x7f, v16
+; SDAG-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
+; SDAG-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_or_b32_e32 v18, v18, v20
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v19, v17, v21
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_and_b32_e32 v18, 1, v22
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v18
+; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v33, v3, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v31, v2, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v30, v1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v32, v0, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_6
+; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v16
+; SDAG-NEXT:    v_sub_i32_e64 v22, s[4:5], 63, v16
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[0:1], v22
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v20, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v21, vcc
+; SDAG-NEXT:    v_or_b32_e32 v20, v30, v32
+; SDAG-NEXT:    v_sub_i32_e32 v26, vcc, 0x7f, v16
+; SDAG-NEXT:    v_or_b32_e32 v21, v31, v33
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[2:3], v26
+; SDAG-NEXT:    v_sub_i32_e32 v27, vcc, 64, v26
+; SDAG-NEXT:    v_lshl_b64 v[24:25], v[0:1], v26
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT:    v_lshr_b64 v[20:21], v[0:1], v27
+; SDAG-NEXT:    v_or_b32_e32 v17, v17, v21
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v20
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v26
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v23, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v22, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v23, 0, v25, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v22, 0, v24, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v26
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v17, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v16, v2, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_5
+; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT:    v_lshr_b64 v[18:19], v[0:1], v30
+; SDAG-NEXT:    v_sub_i32_e32 v28, vcc, 64, v30
+; SDAG-NEXT:    v_subrev_i32_e32 v35, vcc, 64, v30
+; SDAG-NEXT:    v_lshr_b64 v[26:27], v[2:3], v30
+; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v8
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v24, 0
+; SDAG-NEXT:    v_mov_b32_e32 v25, 0
+; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; SDAG-NEXT:    v_lshl_b64 v[28:29], v[2:3], v28
+; SDAG-NEXT:    v_lshr_b64 v[37:38], v[2:3], v35
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v19, v19, v29
+; SDAG-NEXT:    v_or_b32_e32 v18, v18, v28
+; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v10, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, v38, v19, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v37, v18, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v29, 0, v27, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v28, 0, v26, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
+; SDAG-NEXT:    v_cndmask_b32_e32 v27, v19, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v26, v18, v0, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:  .LBB3_3: ; %udiv-do-while3
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshl_b64 v[28:29], v[28:29], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v18, 31, v27
+; SDAG-NEXT:    v_lshl_b64 v[26:27], v[26:27], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v17
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v23
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT:    v_or_b32_e32 v28, v28, v18
+; SDAG-NEXT:    v_or_b32_e32 v26, v26, v38
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v39
+; SDAG-NEXT:    v_or_b32_e32 v17, v21, v17
+; SDAG-NEXT:    v_sub_i32_e32 v18, vcc, v34, v26
+; SDAG-NEXT:    v_or_b32_e32 v16, v20, v16
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, v35, v27, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, v36, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, v37, v29, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v38, 31, v18
+; SDAG-NEXT:    v_and_b32_e32 v39, v38, v8
+; SDAG-NEXT:    v_and_b32_e32 v48, v38, v9
+; SDAG-NEXT:    v_and_b32_e32 v49, v38, v10
+; SDAG-NEXT:    v_and_b32_e32 v18, 1, v38
+; SDAG-NEXT:    v_and_b32_e32 v38, v38, v11
+; SDAG-NEXT:    v_sub_i32_e32 v26, vcc, v26, v39
+; SDAG-NEXT:    v_subb_u32_e32 v27, vcc, v27, v48, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v28, vcc, v28, v49, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v29, vcc, v29, v38, vcc
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; SDAG-NEXT:    v_or_b32_e32 v38, v30, v32
+; SDAG-NEXT:    v_or_b32_e32 v39, v31, v33
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT:    v_or_b32_e32 v23, v25, v23
+; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT:    v_or_b32_e32 v22, v24, v22
+; SDAG-NEXT:    v_mov_b32_e32 v25, v19
+; SDAG-NEXT:    v_mov_b32_e32 v24, v18
+; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT:    s_cbranch_execnz .LBB3_3
+; SDAG-NEXT:  ; %bb.4: ; %Flow13
+; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT:  .LBB3_5: ; %Flow14
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v24, 31, v23
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v24
+; SDAG-NEXT:    v_or_b32_e32 v33, v21, v17
+; SDAG-NEXT:    v_or_b32_e32 v30, v19, v23
+; SDAG-NEXT:    v_or_b32_e32 v31, v20, v16
+; SDAG-NEXT:    v_or_b32_e32 v32, v18, v22
+; SDAG-NEXT:  .LBB3_6: ; %Flow16
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_or_b32_e32 v17, v13, v15
+; SDAG-NEXT:    v_or_b32_e32 v16, v12, v14
+; SDAG-NEXT:    v_or_b32_e32 v19, v5, v7
+; SDAG-NEXT:    v_or_b32_e32 v18, v4, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v14
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v15
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v12
+; SDAG-NEXT:    v_ffbh_u32_e32 v23, v13
+; SDAG-NEXT:    v_ffbh_u32_e32 v24, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v25, v7
+; SDAG-NEXT:    v_ffbh_u32_e32 v26, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v27, v5
+; SDAG-NEXT:    v_mov_b32_e32 v28, 0
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v20
+; SDAG-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v22
+; SDAG-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v24
+; SDAG-NEXT:    v_add_i32_e64 v19, s[6:7], 32, v26
+; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT:    v_min_u32_e32 v16, v16, v21
+; SDAG-NEXT:    v_min_u32_e32 v17, v17, v23
+; SDAG-NEXT:    v_min_u32_e32 v18, v18, v25
+; SDAG-NEXT:    v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT:    v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT:    v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 64, v19
+; SDAG-NEXT:    v_addc_u32_e64 v21, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v20, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v21, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v16, v18
+; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, v20, v17, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v20, 0x7f, v16
+; SDAG-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
+; SDAG-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_or_b32_e32 v20, v20, v18
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v21, v17, v19
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT:    v_and_b32_e32 v20, 1, v22
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v20
+; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v23, v7, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v22, v6, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v5, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v4, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_12
+; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT:    v_add_i32_e32 v34, vcc, 1, v16
+; SDAG-NEXT:    v_sub_i32_e64 v22, s[4:5], 63, v16
+; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[4:5], v22
+; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v34, v36
+; SDAG-NEXT:    v_sub_i32_e32 v19, vcc, 0x7f, v16
+; SDAG-NEXT:    v_or_b32_e32 v18, v35, v37
+; SDAG-NEXT:    v_lshl_b64 v[24:25], v[6:7], v19
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, 64, v19
+; SDAG-NEXT:    v_lshl_b64 v[26:27], v[4:5], v19
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[17:18]
+; SDAG-NEXT:    v_lshr_b64 v[16:17], v[4:5], v16
+; SDAG-NEXT:    v_or_b32_e32 v17, v25, v17
+; SDAG-NEXT:    v_or_b32_e32 v16, v24, v16
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v19
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v23, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v22, v22, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, 0, v27, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, 0, v26, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, v18, v7, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v22, v6, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v22, 0
+; SDAG-NEXT:    v_mov_b32_e32 v23, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_11
+; SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT:    v_lshr_b64 v[20:21], v[4:5], v34
+; SDAG-NEXT:    v_sub_i32_e32 v28, vcc, 64, v34
+; SDAG-NEXT:    v_subrev_i32_e32 v39, vcc, 64, v34
+; SDAG-NEXT:    v_lshr_b64 v[26:27], v[6:7], v34
+; SDAG-NEXT:    v_add_i32_e32 v38, vcc, -1, v12
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v24, 0
+; SDAG-NEXT:    v_mov_b32_e32 v25, 0
+; SDAG-NEXT:    v_mov_b32_e32 v22, 0
+; SDAG-NEXT:    v_mov_b32_e32 v23, 0
+; SDAG-NEXT:    v_lshl_b64 v[28:29], v[6:7], v28
+; SDAG-NEXT:    v_lshr_b64 v[49:50], v[6:7], v39
+; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, -1, v13, vcc
+; SDAG-NEXT:    v_or_b32_e32 v21, v21, v29
+; SDAG-NEXT:    v_or_b32_e32 v20, v20, v28
+; SDAG-NEXT:    v_addc_u32_e32 v48, vcc, -1, v14, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v50, v21, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v49, v20, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v29, 0, v27, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v28, 0, v26, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v49, vcc, -1, v15, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v34
+; SDAG-NEXT:    v_cndmask_b32_e32 v27, v21, v5, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v26, v20, v4, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; SDAG-NEXT:  .LBB3_9: ; %udiv-do-while
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshl_b64 v[28:29], v[28:29], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v20, 31, v27
+; SDAG-NEXT:    v_lshl_b64 v[26:27], v[26:27], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v50, 31, v19
+; SDAG-NEXT:    v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v51, 31, v17
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT:    v_or_b32_e32 v28, v28, v20
+; SDAG-NEXT:    v_or_b32_e32 v26, v26, v50
+; SDAG-NEXT:    v_or_b32_e32 v18, v18, v51
+; SDAG-NEXT:    v_or_b32_e32 v19, v23, v19
+; SDAG-NEXT:    v_or_b32_e32 v17, v25, v17
+; SDAG-NEXT:    v_or_b32_e32 v18, v22, v18
+; SDAG-NEXT:    v_sub_i32_e32 v20, vcc, v38, v26
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, v39, v27, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, v48, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, v49, v29, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v25, 31, v20
+; SDAG-NEXT:    v_and_b32_e32 v20, 1, v25
+; SDAG-NEXT:    v_and_b32_e32 v50, v25, v15
+; SDAG-NEXT:    v_and_b32_e32 v51, v25, v14
+; SDAG-NEXT:    v_and_b32_e32 v52, v25, v13
+; SDAG-NEXT:    v_and_b32_e32 v25, v25, v12
+; SDAG-NEXT:    v_sub_i32_e32 v26, vcc, v26, v25
+; SDAG-NEXT:    v_subb_u32_e32 v27, vcc, v27, v52, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v28, vcc, v28, v51, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v29, vcc, v29, v50, vcc
+; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v34
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v35, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v36, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v37, vcc
+; SDAG-NEXT:    v_or_b32_e32 v51, v35, v37
+; SDAG-NEXT:    v_or_b32_e32 v50, v34, v36
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[50:51]
+; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT:    v_or_b32_e32 v16, v24, v16
+; SDAG-NEXT:    v_mov_b32_e32 v25, v21
+; SDAG-NEXT:    v_mov_b32_e32 v24, v20
+; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT:    s_cbranch_execnz .LBB3_9
+; SDAG-NEXT:  ; %bb.10: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT:  .LBB3_11: ; %Flow11
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v24, 31, v17
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT:    v_or_b32_e32 v18, v18, v24
+; SDAG-NEXT:    v_or_b32_e32 v23, v23, v19
+; SDAG-NEXT:    v_or_b32_e32 v21, v21, v17
+; SDAG-NEXT:    v_or_b32_e32 v22, v22, v18
+; SDAG-NEXT:    v_or_b32_e32 v20, v20, v16
+; SDAG-NEXT:  .LBB3_12: ; %Flow12
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mul_lo_u32 v18, v32, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0
+; SDAG-NEXT:    v_mul_lo_u32 v28, v30, v10
+; SDAG-NEXT:    v_mul_lo_u32 v29, v33, v8
+; SDAG-NEXT:    v_mul_lo_u32 v33, v31, v9
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    v_mul_lo_u32 v34, v20, v15
+; SDAG-NEXT:    v_mad_u64_u32 v[24:25], s[4:5], v20, v14, 0
+; SDAG-NEXT:    v_mul_lo_u32 v35, v21, v14
+; SDAG-NEXT:    v_mul_lo_u32 v23, v23, v12
+; SDAG-NEXT:    v_mul_lo_u32 v36, v22, v13
+; SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v12, v20, 0
+; SDAG-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
+; SDAG-NEXT:    v_mov_b32_e32 v18, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[18:19]
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
+; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], v25, v34
+; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v17, v28
+; SDAG-NEXT:    v_mov_b32_e32 v28, v27
+; SDAG-NEXT:    v_mov_b32_e32 v27, v19
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[26:27]
+; SDAG-NEXT:    v_add_i32_e64 v25, s[4:5], v18, v35
+; SDAG-NEXT:    v_mov_b32_e32 v18, v15
+; SDAG-NEXT:    v_mad_u64_u32 v[26:27], s[4:5], v13, v20, v[18:19]
+; SDAG-NEXT:    v_mad_u64_u32 v[15:16], s[4:5], v31, v8, v[16:17]
+; SDAG-NEXT:    v_mov_b32_e32 v8, v11
+; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v28, v8
+; SDAG-NEXT:    v_addc_u32_e64 v18, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v8, v10
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25]
+; SDAG-NEXT:    v_mov_b32_e32 v22, v27
+; SDAG-NEXT:    v_mov_b32_e32 v27, v19
+; SDAG-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v12, v21, v[26:27]
+; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], v29, v16
+; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[17:18]
+; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v23, v11
+; SDAG-NEXT:    v_mov_b32_e32 v11, v20
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v22, v11
+; SDAG-NEXT:    v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], v33, v16
+; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v36, v17
+; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v13, v21, v[11:12]
+; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
+; SDAG-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v16, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v2, v8, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; SDAG-NEXT:    v_add_i32_e32 v8, vcc, v11, v10
+; SDAG-NEXT:    v_addc_u32_e32 v9, vcc, v12, v17, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v10, v19
+; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v14
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v10, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v6, v8, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v9, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_urem_v2i128_vv:
+; GISEL:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GISEL-NEXT:    v_or_b32_e32 v16, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v17, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v18, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v19, v1, v3
+; GISEL-NEXT:    v_ffbh_u32_e32 v22, v9
+; GISEL-NEXT:    v_ffbh_u32_e32 v23, v8
+; GISEL-NEXT:    v_ffbh_u32_e32 v24, v11
+; GISEL-NEXT:    v_ffbh_u32_e32 v25, v10
+; GISEL-NEXT:    v_ffbh_u32_e32 v26, v1
+; GISEL-NEXT:    v_ffbh_u32_e32 v27, v0
+; GISEL-NEXT:    v_ffbh_u32_e32 v28, v3
+; GISEL-NEXT:    v_ffbh_u32_e32 v29, v2
+; GISEL-NEXT:    v_mov_b32_e32 v20, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v21, 0
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v23
+; GISEL-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v25
+; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v27
+; GISEL-NEXT:    v_add_i32_e64 v19, s[6:7], 32, v29
+; GISEL-NEXT:    v_min_u32_e32 v16, v22, v16
+; GISEL-NEXT:    v_min_u32_e32 v17, v24, v17
+; GISEL-NEXT:    v_min_u32_e32 v18, v26, v18
+; GISEL-NEXT:    v_min_u32_e32 v19, v28, v19
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 64, v16
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, 64, v18
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v18, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v19, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[16:17], v[20:21]
+; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v20, 0x7f, v16
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v20, v20, v18
+; GISEL-NEXT:    v_or_b32_e32 v21, v17, v19
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v21, v22, v23
+; GISEL-NEXT:    v_and_b32_e32 v22, 1, v21
+; GISEL-NEXT:    v_or_b32_e32 v20, v21, v20
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v22
+; GISEL-NEXT:    v_cndmask_b32_e64 v32, v0, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v22, 1, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v33, v1, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, v2, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v21, v3, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v22
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT:    v_add_i32_e32 v30, vcc, 1, v16
+; GISEL-NEXT:    v_addc_u32_e64 v31, s[4:5], 0, v17, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v26, vcc, 0x7f, v16
+; GISEL-NEXT:    v_addc_u32_e64 v32, vcc, 0, v18, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v33, vcc, 0, v19, vcc
+; GISEL-NEXT:    v_subrev_i32_e64 v22, s[4:5], 64, v26
+; GISEL-NEXT:    v_sub_i32_e64 v20, s[4:5], 64, v26
+; GISEL-NEXT:    v_lshl_b64 v[16:17], v[0:1], v26
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[2:3], v26
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    v_lshr_b64 v[20:21], v[0:1], v20
+; GISEL-NEXT:    v_lshl_b64 v[24:25], v[0:1], v22
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT:    v_cndmask_b32_e32 v22, 0, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v23, 0, v17, vcc
+; GISEL-NEXT:    v_or_b32_e32 v16, v20, v18
+; GISEL-NEXT:    v_or_b32_e32 v17, v21, v19
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v24, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v17, v25, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT:    v_cndmask_b32_e32 v20, v16, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v21, v17, v3, vcc
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v19, s11
+; GISEL-NEXT:    v_mov_b32_e32 v18, s10
+; GISEL-NEXT:    v_mov_b32_e32 v17, s9
+; GISEL-NEXT:    v_mov_b32_e32 v16, s8
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_5
+; GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT:    v_subrev_i32_e32 v26, vcc, 64, v30
+; GISEL-NEXT:    v_sub_i32_e32 v24, vcc, 64, v30
+; GISEL-NEXT:    v_lshr_b64 v[16:17], v[2:3], v30
+; GISEL-NEXT:    v_lshr_b64 v[18:19], v[0:1], v30
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_add_i32_e32 v34, vcc, -1, v8
+; GISEL-NEXT:    v_addc_u32_e32 v35, vcc, -1, v9, vcc
+; GISEL-NEXT:    v_lshl_b64 v[24:25], v[2:3], v24
+; GISEL-NEXT:    v_lshr_b64 v[26:27], v[2:3], v26
+; GISEL-NEXT:    v_addc_u32_e32 v36, vcc, -1, v10, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v37, vcc, -1, v11, vcc
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v24
+; GISEL-NEXT:    v_or_b32_e32 v19, v19, v25
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v30
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, v26, v18, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v19, v27, v19, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v28, 0, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v29, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
+; GISEL-NEXT:    v_cndmask_b32_e32 v26, v18, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v27, v19, v1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v25, 0
+; GISEL-NEXT:    v_mov_b32_e32 v19, s7
+; GISEL-NEXT:    v_mov_b32_e32 v18, s6
+; GISEL-NEXT:    v_mov_b32_e32 v17, s5
+; GISEL-NEXT:    v_mov_b32_e32 v16, s4
+; GISEL-NEXT:  .LBB3_3: ; %udiv-do-while3
+; GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[22:23], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v24, 31, v23
+; GISEL-NEXT:    v_lshl_b64 v[38:39], v[26:27], 1
+; GISEL-NEXT:    v_lshl_b64 v[28:29], v[28:29], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v26, 31, v27
+; GISEL-NEXT:    v_lshrrev_b32_e32 v27, 31, v21
+; GISEL-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
+; GISEL-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
+; GISEL-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; GISEL-NEXT:    v_or_b32_e32 v22, v16, v18
+; GISEL-NEXT:    v_or_b32_e32 v23, v17, v19
+; GISEL-NEXT:    v_or_b32_e32 v18, v28, v26
+; GISEL-NEXT:    v_or_b32_e32 v19, v38, v27
+; GISEL-NEXT:    v_or_b32_e32 v20, v20, v24
+; GISEL-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v34, v19
+; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, v35, v39, vcc
+; GISEL-NEXT:    v_or_b32_e32 v16, v30, v32
+; GISEL-NEXT:    v_or_b32_e32 v17, v31, v33
+; GISEL-NEXT:    v_subb_u32_e32 v24, vcc, v36, v18, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v24, vcc, v37, v29, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v16, 31, v24
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v24, 1, v16
+; GISEL-NEXT:    v_and_b32_e32 v17, v16, v8
+; GISEL-NEXT:    v_and_b32_e32 v27, v16, v9
+; GISEL-NEXT:    v_and_b32_e32 v28, v16, v10
+; GISEL-NEXT:    v_and_b32_e32 v16, v16, v11
+; GISEL-NEXT:    v_sub_i32_e32 v26, vcc, v19, v17
+; GISEL-NEXT:    v_subb_u32_e32 v27, vcc, v39, v27, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v28, vcc, v18, v28, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v29, vcc, v29, v16, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v16, v24
+; GISEL-NEXT:    v_mov_b32_e32 v17, v25
+; GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execnz .LBB3_3
+; GISEL-NEXT:  ; %bb.4: ; %Flow13
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB3_5: ; %Flow14
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[22:23], 1
+; GISEL-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 31, v23
+; GISEL-NEXT:    v_or_b32_e32 v20, v20, v22
+; GISEL-NEXT:    v_or_b32_e32 v32, v16, v18
+; GISEL-NEXT:    v_or_b32_e32 v33, v17, v19
+; GISEL-NEXT:  .LBB3_6: ; %Flow16
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GISEL-NEXT:    v_or_b32_e32 v16, v12, v14
+; GISEL-NEXT:    v_or_b32_e32 v17, v13, v15
+; GISEL-NEXT:    v_or_b32_e32 v18, v4, v6
+; GISEL-NEXT:    v_or_b32_e32 v19, v5, v7
+; GISEL-NEXT:    v_ffbh_u32_e32 v22, v13
+; GISEL-NEXT:    v_ffbh_u32_e32 v23, v12
+; GISEL-NEXT:    v_ffbh_u32_e32 v26, v15
+; GISEL-NEXT:    v_ffbh_u32_e32 v27, v14
+; GISEL-NEXT:    v_ffbh_u32_e32 v28, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v29, v4
+; GISEL-NEXT:    v_ffbh_u32_e32 v30, v7
+; GISEL-NEXT:    v_ffbh_u32_e32 v31, v6
+; GISEL-NEXT:    v_mov_b32_e32 v24, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v25, 0
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v23
+; GISEL-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v27
+; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v29
+; GISEL-NEXT:    v_add_i32_e64 v19, s[6:7], 32, v31
+; GISEL-NEXT:    v_min_u32_e32 v16, v22, v16
+; GISEL-NEXT:    v_min_u32_e32 v17, v26, v17
+; GISEL-NEXT:    v_min_u32_e32 v18, v28, v18
+; GISEL-NEXT:    v_min_u32_e32 v19, v30, v19
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 64, v16
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, 64, v18
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v17, v19, v18, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[16:17], v[24:25]
+; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v18, 0x7f, v16
+; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v22
+; GISEL-NEXT:    v_or_b32_e32 v19, v17, v23
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v19, v26, v24
+; GISEL-NEXT:    v_and_b32_e32 v24, 1, v19
+; GISEL-NEXT:    v_or_b32_e32 v18, v19, v18
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v24
+; GISEL-NEXT:    v_cndmask_b32_e64 v24, v4, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v26, 1, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v25, v5, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, v6, 0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, v7, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v26
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_12
+; GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT:    v_add_i32_e32 v34, vcc, 1, v16
+; GISEL-NEXT:    v_addc_u32_e64 v35, s[4:5], 0, v17, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v28, vcc, 0x7f, v16
+; GISEL-NEXT:    v_addc_u32_e64 v36, vcc, 0, v22, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v37, vcc, 0, v23, vcc
+; GISEL-NEXT:    v_subrev_i32_e64 v24, s[4:5], 64, v28
+; GISEL-NEXT:    v_sub_i32_e64 v22, s[4:5], 64, v28
+; GISEL-NEXT:    v_lshl_b64 v[16:17], v[4:5], v28
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[6:7], v28
+; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    v_lshr_b64 v[22:23], v[4:5], v22
+; GISEL-NEXT:    v_lshl_b64 v[26:27], v[4:5], v24
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v28
+; GISEL-NEXT:    v_cndmask_b32_e32 v24, 0, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v25, 0, v17, vcc
+; GISEL-NEXT:    v_or_b32_e32 v16, v22, v18
+; GISEL-NEXT:    v_or_b32_e32 v17, v23, v19
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v26, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v17, v27, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v28
+; GISEL-NEXT:    v_cndmask_b32_e32 v22, v16, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v23, v17, v7, vcc
+; GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v19, s11
+; GISEL-NEXT:    v_mov_b32_e32 v18, s10
+; GISEL-NEXT:    v_mov_b32_e32 v17, s9
+; GISEL-NEXT:    v_mov_b32_e32 v16, s8
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_11
+; GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT:    v_subrev_i32_e32 v28, vcc, 64, v34
+; GISEL-NEXT:    v_sub_i32_e32 v26, vcc, 64, v34
+; GISEL-NEXT:    v_lshr_b64 v[16:17], v[6:7], v34
+; GISEL-NEXT:    v_lshr_b64 v[18:19], v[4:5], v34
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_add_i32_e32 v38, vcc, -1, v12
+; GISEL-NEXT:    v_addc_u32_e32 v39, vcc, -1, v13, vcc
+; GISEL-NEXT:    v_lshl_b64 v[26:27], v[6:7], v26
+; GISEL-NEXT:    v_lshr_b64 v[28:29], v[6:7], v28
+; GISEL-NEXT:    v_addc_u32_e32 v48, vcc, -1, v14, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v49, vcc, -1, v15, vcc
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v26
+; GISEL-NEXT:    v_or_b32_e32 v19, v19, v27
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v34
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, v28, v18, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v19, v29, v19, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v30, 0, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v31, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v34
+; GISEL-NEXT:    v_cndmask_b32_e32 v28, v18, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v29, v19, v5, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v27, 0
+; GISEL-NEXT:    v_mov_b32_e32 v19, s7
+; GISEL-NEXT:    v_mov_b32_e32 v18, s6
+; GISEL-NEXT:    v_mov_b32_e32 v17, s5
+; GISEL-NEXT:    v_mov_b32_e32 v16, s4
+; GISEL-NEXT:  .LBB3_9: ; %udiv-do-while
+; GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[24:25], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v26, 31, v25
+; GISEL-NEXT:    v_lshl_b64 v[50:51], v[28:29], 1
+; GISEL-NEXT:    v_lshl_b64 v[30:31], v[30:31], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v28, 31, v29
+; GISEL-NEXT:    v_lshrrev_b32_e32 v29, 31, v23
+; GISEL-NEXT:    v_lshl_b64 v[22:23], v[22:23], 1
+; GISEL-NEXT:    v_add_i32_e32 v34, vcc, -1, v34
+; GISEL-NEXT:    v_addc_u32_e32 v35, vcc, -1, v35, vcc
+; GISEL-NEXT:    v_or_b32_e32 v24, v16, v18
+; GISEL-NEXT:    v_or_b32_e32 v25, v17, v19
+; GISEL-NEXT:    v_or_b32_e32 v18, v30, v28
+; GISEL-NEXT:    v_or_b32_e32 v19, v50, v29
+; GISEL-NEXT:    v_or_b32_e32 v22, v22, v26
+; GISEL-NEXT:    v_addc_u32_e32 v36, vcc, -1, v36, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v37, vcc, -1, v37, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v38, v19
+; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, v39, v51, vcc
+; GISEL-NEXT:    v_or_b32_e32 v16, v34, v36
+; GISEL-NEXT:    v_or_b32_e32 v17, v35, v37
+; GISEL-NEXT:    v_subb_u32_e32 v26, vcc, v48, v18, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v26, vcc, v49, v31, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v16, 31, v26
+; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v26, 1, v16
+; GISEL-NEXT:    v_and_b32_e32 v17, v16, v12
+; GISEL-NEXT:    v_and_b32_e32 v29, v16, v13
+; GISEL-NEXT:    v_and_b32_e32 v30, v16, v14
+; GISEL-NEXT:    v_and_b32_e32 v50, v16, v15
+; GISEL-NEXT:    v_sub_i32_e32 v28, vcc, v19, v17
+; GISEL-NEXT:    v_subb_u32_e32 v29, vcc, v51, v29, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v16, v26
+; GISEL-NEXT:    v_mov_b32_e32 v17, v27
+; GISEL-NEXT:    v_subb_u32_e32 v30, vcc, v18, v30, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v31, vcc, v31, v50, vcc
+; GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execnz .LBB3_9
+; GISEL-NEXT:  ; %bb.10: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB3_11: ; %Flow11
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_b64 v[26:27], v[24:25], 1
+; GISEL-NEXT:    v_lshl_b64 v[18:19], v[22:23], 1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 31, v25
+; GISEL-NEXT:    v_or_b32_e32 v18, v18, v22
+; GISEL-NEXT:    v_or_b32_e32 v24, v16, v26
+; GISEL-NEXT:    v_or_b32_e32 v25, v17, v27
+; GISEL-NEXT:  .LBB3_12: ; %Flow12
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0
+; GISEL-NEXT:    v_mul_lo_u32 v28, v8, v21
+; GISEL-NEXT:    v_mul_lo_u32 v29, v9, v20
+; GISEL-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0
+; GISEL-NEXT:    v_mul_lo_u32 v30, v12, v19
+; GISEL-NEXT:    v_mul_lo_u32 v31, v13, v18
+; GISEL-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23]
+; GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27]
+; GISEL-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19]
+; GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23]
+; GISEL-NEXT:    v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18]
+; GISEL-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22]
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18]
+; GISEL-NEXT:    v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22]
+; GISEL-NEXT:    v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7]
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v17, v29, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], v4, v20
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18]
+; GISEL-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17]
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19]
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v9, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v6, v13, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v7, v11, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %shl = urem <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
diff --git a/llvm/test/Transforms/ExpandLargeDivRem/X86/vector.ll b/llvm/test/Transforms/ExpandLargeDivRem/X86/vector.ll
new file mode 100644
index 00000000000000..215ba3e36b6493
--- /dev/null
+++ b/llvm/test/Transforms/ExpandLargeDivRem/X86/vector.ll
@@ -0,0 +1,525 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=x86_64-- -expand-large-div-rem -expand-div-rem-bits 128 < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-- -passes=expand-large-div-rem -expand-div-rem-bits 128 < %s | FileCheck %s
+
+define <2 x i129> @sdiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
+; CHECK-LABEL: define <2 x i129> @sdiv129(
+; CHECK-SAME: <2 x i129> [[A:%.*]], <2 x i129> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  _udiv-special-cases_udiv-special-cases:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i129> [[A]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i129> [[B]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = freeze i129 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i129 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = ashr i129 [[TMP2]], 128
+; CHECK-NEXT:    [[TMP5:%.*]] = ashr i129 [[TMP3]], 128
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i129 [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i129 [[TMP5]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i129 [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i129 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze i129 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = freeze i129 [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i129 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i129 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP11]], i1 true)
+; CHECK-NEXT:    [[TMP17:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP12]], i1 true)
+; CHECK-NEXT:    [[TMP18:%.*]] = sub i129 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ugt i129 [[TMP18]], 128
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP15]], i1 true, i1 [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i129 [[TMP18]], 128
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP20]], i129 0, i129 [[TMP12]]
+; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP21]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]]
+; CHECK:       udiv-loop-exit2:
+; CHECK-NEXT:    [[TMP24:%.*]] = phi i129 [ 0, [[UDIV_BB15]] ], [ [[TMP39:%.*]], [[UDIV_DO_WHILE3:%.*]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = phi i129 [ [[TMP48:%.*]], [[UDIV_BB15]] ], [ [[TMP36:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = shl i129 [[TMP25]], 1
+; CHECK-NEXT:    [[TMP27:%.*]] = or i129 [[TMP24]], [[TMP26]]
+; CHECK-NEXT:    br label [[UDIV_END1]]
+; CHECK:       udiv-do-while3:
+; CHECK-NEXT:    [[TMP28:%.*]] = phi i129 [ 0, [[UDIV_PREHEADER4:%.*]] ], [ [[TMP39]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = phi i129 [ [[TMP46:%.*]], [[UDIV_PREHEADER4]] ], [ [[TMP42:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP30:%.*]] = phi i129 [ [[TMP44:%.*]], [[UDIV_PREHEADER4]] ], [ [[TMP41:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = phi i129 [ [[TMP48]], [[UDIV_PREHEADER4]] ], [ [[TMP36]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP32:%.*]] = shl i129 [[TMP30]], 1
+; CHECK-NEXT:    [[TMP33:%.*]] = lshr i129 [[TMP31]], 128
+; CHECK-NEXT:    [[TMP34:%.*]] = or i129 [[TMP32]], [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = shl i129 [[TMP31]], 1
+; CHECK-NEXT:    [[TMP36]] = or i129 [[TMP28]], [[TMP35]]
+; CHECK-NEXT:    [[TMP37:%.*]] = sub i129 [[TMP45:%.*]], [[TMP34]]
+; CHECK-NEXT:    [[TMP38:%.*]] = ashr i129 [[TMP37]], 128
+; CHECK-NEXT:    [[TMP39]] = and i129 [[TMP38]], 1
+; CHECK-NEXT:    [[TMP40:%.*]] = and i129 [[TMP38]], [[TMP11]]
+; CHECK-NEXT:    [[TMP41]] = sub i129 [[TMP34]], [[TMP40]]
+; CHECK-NEXT:    [[TMP42]] = add i129 [[TMP29]], -1
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp eq i129 [[TMP42]], 0
+; CHECK-NEXT:    br i1 [[TMP43]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]]
+; CHECK:       udiv-preheader4:
+; CHECK-NEXT:    [[TMP44]] = lshr i129 [[TMP12]], [[TMP46]]
+; CHECK-NEXT:    [[TMP45]] = add i129 [[TMP11]], -1
+; CHECK-NEXT:    br label [[UDIV_DO_WHILE3]]
+; CHECK:       udiv-bb15:
+; CHECK-NEXT:    [[TMP46]] = add i129 [[TMP18]], 1
+; CHECK-NEXT:    [[TMP47:%.*]] = sub i129 128, [[TMP18]]
+; CHECK-NEXT:    [[TMP48]] = shl i129 [[TMP12]], [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i129 [[TMP46]], 0
+; CHECK-NEXT:    br i1 [[TMP49]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]]
+; CHECK:       udiv-end1:
+; CHECK-NEXT:    [[TMP50:%.*]] = phi i129 [ [[TMP27]], [[UDIV_LOOP_EXIT2]] ], [ [[TMP22]], [[_UDIV_SPECIAL_CASES_UDIV_SPECIAL_CASES:%.*]] ]
+; CHECK-NEXT:    [[TMP51:%.*]] = xor i129 [[TMP50]], [[TMP10]]
+; CHECK-NEXT:    [[TMP52:%.*]] = sub i129 [[TMP51]], [[TMP10]]
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <2 x i129> poison, i129 [[TMP52]], i64 0
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <2 x i129> [[B]], i64 1
+; CHECK-NEXT:    [[TMP56:%.*]] = freeze i129 [[TMP54]]
+; CHECK-NEXT:    [[TMP57:%.*]] = freeze i129 [[TMP55]]
+; CHECK-NEXT:    [[TMP58:%.*]] = ashr i129 [[TMP56]], 128
+; CHECK-NEXT:    [[TMP59:%.*]] = ashr i129 [[TMP57]], 128
+; CHECK-NEXT:    [[TMP60:%.*]] = xor i129 [[TMP58]], [[TMP56]]
+; CHECK-NEXT:    [[TMP61:%.*]] = sub i129 [[TMP60]], [[TMP58]]
+; CHECK-NEXT:    [[TMP62:%.*]] = xor i129 [[TMP59]], [[TMP57]]
+; CHECK-NEXT:    [[TMP63:%.*]] = sub i129 [[TMP62]], [[TMP59]]
+; CHECK-NEXT:    [[TMP64:%.*]] = xor i129 [[TMP59]], [[TMP58]]
+; CHECK-NEXT:    [[TMP65:%.*]] = freeze i129 [[TMP63]]
+; CHECK-NEXT:    [[TMP66:%.*]] = freeze i129 [[TMP61]]
+; CHECK-NEXT:    [[TMP67:%.*]] = icmp eq i129 [[TMP65]], 0
+; CHECK-NEXT:    [[TMP68:%.*]] = icmp eq i129 [[TMP66]], 0
+; CHECK-NEXT:    [[TMP69:%.*]] = or i1 [[TMP67]], [[TMP68]]
+; CHECK-NEXT:    [[TMP70:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP65]], i1 true)
+; CHECK-NEXT:    [[TMP71:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP66]], i1 true)
+; CHECK-NEXT:    [[TMP72:%.*]] = sub i129 [[TMP70]], [[TMP71]]
+; CHECK-NEXT:    [[TMP73:%.*]] = icmp ugt i129 [[TMP72]], 128
+; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP69]], i1 true, i1 [[TMP73]]
+; CHECK-NEXT:    [[TMP75:%.*]] = icmp eq i129 [[TMP72]], 128
+; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP74]], i129 0, i129 [[TMP66]]
+; CHECK-NEXT:    [[TMP77:%.*]] = select i1 [[TMP74]], i1 true, i1 [[TMP75]]
+; CHECK-NEXT:    br i1 [[TMP77]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK:       udiv-loop-exit:
+; CHECK-NEXT:    [[TMP78:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP93:%.*]], [[UDIV_DO_WHILE:%.*]] ]
+; CHECK-NEXT:    [[TMP79:%.*]] = phi i129 [ [[TMP102:%.*]], [[UDIV_BB1]] ], [ [[TMP90:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP80:%.*]] = shl i129 [[TMP79]], 1
+; CHECK-NEXT:    [[TMP81:%.*]] = or i129 [[TMP78]], [[TMP80]]
+; CHECK-NEXT:    br label [[UDIV_END]]
+; CHECK:       udiv-do-while:
+; CHECK-NEXT:    [[TMP82:%.*]] = phi i129 [ 0, [[UDIV_PREHEADER:%.*]] ], [ [[TMP93]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP83:%.*]] = phi i129 [ [[TMP100:%.*]], [[UDIV_PREHEADER]] ], [ [[TMP96:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP84:%.*]] = phi i129 [ [[TMP98:%.*]], [[UDIV_PREHEADER]] ], [ [[TMP95:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP85:%.*]] = phi i129 [ [[TMP102]], [[UDIV_PREHEADER]] ], [ [[TMP90]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP86:%.*]] = shl i129 [[TMP84]], 1
+; CHECK-NEXT:    [[TMP87:%.*]] = lshr i129 [[TMP85]], 128
+; CHECK-NEXT:    [[TMP88:%.*]] = or i129 [[TMP86]], [[TMP87]]
+; CHECK-NEXT:    [[TMP89:%.*]] = shl i129 [[TMP85]], 1
+; CHECK-NEXT:    [[TMP90]] = or i129 [[TMP82]], [[TMP89]]
+; CHECK-NEXT:    [[TMP91:%.*]] = sub i129 [[TMP99:%.*]], [[TMP88]]
+; CHECK-NEXT:    [[TMP92:%.*]] = ashr i129 [[TMP91]], 128
+; CHECK-NEXT:    [[TMP93]] = and i129 [[TMP92]], 1
+; CHECK-NEXT:    [[TMP94:%.*]] = and i129 [[TMP92]], [[TMP65]]
+; CHECK-NEXT:    [[TMP95]] = sub i129 [[TMP88]], [[TMP94]]
+; CHECK-NEXT:    [[TMP96]] = add i129 [[TMP83]], -1
+; CHECK-NEXT:    [[TMP97:%.*]] = icmp eq i129 [[TMP96]], 0
+; CHECK-NEXT:    br i1 [[TMP97]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK:       udiv-preheader:
+; CHECK-NEXT:    [[TMP98]] = lshr i129 [[TMP66]], [[TMP100]]
+; CHECK-NEXT:    [[TMP99]] = add i129 [[TMP65]], -1
+; CHECK-NEXT:    br label [[UDIV_DO_WHILE]]
+; CHECK:       udiv-bb1:
+; CHECK-NEXT:    [[TMP100]] = add i129 [[TMP72]], 1
+; CHECK-NEXT:    [[TMP101:%.*]] = sub i129 128, [[TMP72]]
+; CHECK-NEXT:    [[TMP102]] = shl i129 [[TMP66]], [[TMP101]]
+; CHECK-NEXT:    [[TMP103:%.*]] = icmp eq i129 [[TMP100]], 0
+; CHECK-NEXT:    br i1 [[TMP103]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK:       udiv-end:
+; CHECK-NEXT:    [[TMP104:%.*]] = phi i129 [ [[TMP81]], [[UDIV_LOOP_EXIT]] ], [ [[TMP76]], [[UDIV_END1]] ]
+; CHECK-NEXT:    [[TMP105:%.*]] = xor i129 [[TMP104]], [[TMP64]]
+; CHECK-NEXT:    [[TMP106:%.*]] = sub i129 [[TMP105]], [[TMP64]]
+; CHECK-NEXT:    [[TMP107:%.*]] = insertelement <2 x i129> [[TMP53]], i129 [[TMP106]], i64 1
+; CHECK-NEXT:    ret <2 x i129> [[TMP107]]
+;
+  %res = sdiv <2 x i129> %a, %b
+  ret <2 x i129> %res
+}
+
+define <2 x i129> @udiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
+; CHECK-LABEL: define <2 x i129> @udiv129(
+; CHECK-SAME: <2 x i129> [[A:%.*]], <2 x i129> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  _udiv-special-cases_udiv-special-cases:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i129> [[A]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i129> [[B]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = freeze i129 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i129 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i129 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i129 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP2]], i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i129 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i129 [[TMP9]], 128
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP6]], i1 true, i1 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i129 [[TMP9]], 128
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP11]], i129 0, i129 [[TMP3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP11]], i1 true, i1 [[TMP12]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]]
+; CHECK:       udiv-loop-exit2:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi i129 [ 0, [[UDIV_BB15]] ], [ [[TMP30:%.*]], [[UDIV_DO_WHILE3:%.*]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = phi i129 [ [[TMP39:%.*]], [[UDIV_BB15]] ], [ [[TMP27:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = shl i129 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = or i129 [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    br label [[UDIV_END1]]
+; CHECK:       udiv-do-while3:
+; CHECK-NEXT:    [[TMP19:%.*]] = phi i129 [ 0, [[UDIV_PREHEADER4:%.*]] ], [ [[TMP30]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = phi i129 [ [[TMP37:%.*]], [[UDIV_PREHEADER4]] ], [ [[TMP33:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = phi i129 [ [[TMP35:%.*]], [[UDIV_PREHEADER4]] ], [ [[TMP32:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi i129 [ [[TMP39]], [[UDIV_PREHEADER4]] ], [ [[TMP27]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = shl i129 [[TMP21]], 1
+; CHECK-NEXT:    [[TMP24:%.*]] = lshr i129 [[TMP22]], 128
+; CHECK-NEXT:    [[TMP25:%.*]] = or i129 [[TMP23]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shl i129 [[TMP22]], 1
+; CHECK-NEXT:    [[TMP27]] = or i129 [[TMP19]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = sub i129 [[TMP36:%.*]], [[TMP25]]
+; CHECK-NEXT:    [[TMP29:%.*]] = ashr i129 [[TMP28]], 128
+; CHECK-NEXT:    [[TMP30]] = and i129 [[TMP29]], 1
+; CHECK-NEXT:    [[TMP31:%.*]] = and i129 [[TMP29]], [[TMP2]]
+; CHECK-NEXT:    [[TMP32]] = sub i129 [[TMP25]], [[TMP31]]
+; CHECK-NEXT:    [[TMP33]] = add i129 [[TMP20]], -1
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i129 [[TMP33]], 0
+; CHECK-NEXT:    br i1 [[TMP34]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]]
+; CHECK:       udiv-preheader4:
+; CHECK-NEXT:    [[TMP35]] = lshr i129 [[TMP3]], [[TMP37]]
+; CHECK-NEXT:    [[TMP36]] = add i129 [[TMP2]], -1
+; CHECK-NEXT:    br label [[UDIV_DO_WHILE3]]
+; CHECK:       udiv-bb15:
+; CHECK-NEXT:    [[TMP37]] = add i129 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP38:%.*]] = sub i129 128, [[TMP9]]
+; CHECK-NEXT:    [[TMP39]] = shl i129 [[TMP3]], [[TMP38]]
+; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq i129 [[TMP37]], 0
+; CHECK-NEXT:    br i1 [[TMP40]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]]
+; CHECK:       udiv-end1:
+; CHECK-NEXT:    [[TMP41:%.*]] = phi i129 [ [[TMP18]], [[UDIV_LOOP_EXIT2]] ], [ [[TMP13]], [[_UDIV_SPECIAL_CASES_UDIV_SPECIAL_CASES:%.*]] ]
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <2 x i129> poison, i129 [[TMP41]], i64 0
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i129> [[B]], i64 1
+; CHECK-NEXT:    [[TMP45:%.*]] = freeze i129 [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = freeze i129 [[TMP43]]
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i129 [[TMP45]], 0
+; CHECK-NEXT:    [[TMP48:%.*]] = icmp eq i129 [[TMP46]], 0
+; CHECK-NEXT:    [[TMP49:%.*]] = or i1 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP50:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP45]], i1 true)
+; CHECK-NEXT:    [[TMP51:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP46]], i1 true)
+; CHECK-NEXT:    [[TMP52:%.*]] = sub i129 [[TMP50]], [[TMP51]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp ugt i129 [[TMP52]], 128
+; CHECK-NEXT:    [[TMP54:%.*]] = select i1 [[TMP49]], i1 true, i1 [[TMP53]]
+; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i129 [[TMP52]], 128
+; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i129 0, i129 [[TMP46]]
+; CHECK-NEXT:    [[TMP57:%.*]] = select i1 [[TMP54]], i1 true, i1 [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP57]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK:       udiv-loop-exit:
+; CHECK-NEXT:    [[TMP58:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP73:%.*]], [[UDIV_DO_WHILE:%.*]] ]
+; CHECK-NEXT:    [[TMP59:%.*]] = phi i129 [ [[TMP82:%.*]], [[UDIV_BB1]] ], [ [[TMP70:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP60:%.*]] = shl i129 [[TMP59]], 1
+; CHECK-NEXT:    [[TMP61:%.*]] = or i129 [[TMP58]], [[TMP60]]
+; CHECK-NEXT:    br label [[UDIV_END]]
+; CHECK:       udiv-do-while:
+; CHECK-NEXT:    [[TMP62:%.*]] = phi i129 [ 0, [[UDIV_PREHEADER:%.*]] ], [ [[TMP73]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP63:%.*]] = phi i129 [ [[TMP80:%.*]], [[UDIV_PREHEADER]] ], [ [[TMP76:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP64:%.*]] = phi i129 [ [[TMP78:%.*]], [[UDIV_PREHEADER]] ], [ [[TMP75:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP65:%.*]] = phi i129 [ [[TMP82]], [[UDIV_PREHEADER]] ], [ [[TMP70]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP66:%.*]] = shl i129 [[TMP64]], 1
+; CHECK-NEXT:    [[TMP67:%.*]] = lshr i129 [[TMP65]], 128
+; CHECK-NEXT:    [[TMP68:%.*]] = or i129 [[TMP66]], [[TMP67]]
+; CHECK-NEXT:    [[TMP69:%.*]] = shl i129 [[TMP65]], 1
+; CHECK-NEXT:    [[TMP70]] = or i129 [[TMP62]], [[TMP69]]
+; CHECK-NEXT:    [[TMP71:%.*]] = sub i129 [[TMP79:%.*]], [[TMP68]]
+; CHECK-NEXT:    [[TMP72:%.*]] = ashr i129 [[TMP71]], 128
+; CHECK-NEXT:    [[TMP73]] = and i129 [[TMP72]], 1
+; CHECK-NEXT:    [[TMP74:%.*]] = and i129 [[TMP72]], [[TMP45]]
+; CHECK-NEXT:    [[TMP75]] = sub i129 [[TMP68]], [[TMP74]]
+; CHECK-NEXT:    [[TMP76]] = add i129 [[TMP63]], -1
+; CHECK-NEXT:    [[TMP77:%.*]] = icmp eq i129 [[TMP76]], 0
+; CHECK-NEXT:    br i1 [[TMP77]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK:       udiv-preheader:
+; CHECK-NEXT:    [[TMP78]] = lshr i129 [[TMP46]], [[TMP80]]
+; CHECK-NEXT:    [[TMP79]] = add i129 [[TMP45]], -1
+; CHECK-NEXT:    br label [[UDIV_DO_WHILE]]
+; CHECK:       udiv-bb1:
+; CHECK-NEXT:    [[TMP80]] = add i129 [[TMP52]], 1
+; CHECK-NEXT:    [[TMP81:%.*]] = sub i129 128, [[TMP52]]
+; CHECK-NEXT:    [[TMP82]] = shl i129 [[TMP46]], [[TMP81]]
+; CHECK-NEXT:    [[TMP83:%.*]] = icmp eq i129 [[TMP80]], 0
+; CHECK-NEXT:    br i1 [[TMP83]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK:       udiv-end:
+; CHECK-NEXT:    [[TMP84:%.*]] = phi i129 [ [[TMP61]], [[UDIV_LOOP_EXIT]] ], [ [[TMP56]], [[UDIV_END1]] ]
+; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <2 x i129> [[TMP42]], i129 [[TMP84]], i64 1
+; CHECK-NEXT:    ret <2 x i129> [[TMP85]]
+;
+  %res = udiv <2 x i129> %a, %b
+  ret <2 x i129> %res
+}
+
+define <2 x i129> @srem129(<2 x i129> %a, <2 x i129> %b) nounwind {
+; CHECK-LABEL: define <2 x i129> @srem129(
+; CHECK-SAME: <2 x i129> [[A:%.*]], <2 x i129> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  _udiv-special-cases_udiv-special-cases:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i129> [[A]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i129> [[B]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = freeze i129 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i129 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = ashr i129 [[TMP2]], 128
+; CHECK-NEXT:    [[TMP5:%.*]] = ashr i129 [[TMP3]], 128
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i129 [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i129 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i129 [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i129 [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = freeze i129 [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze i129 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = freeze i129 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = freeze i129 [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i129 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i129 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP12]], i1 true)
+; CHECK-NEXT:    [[TMP18:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP13]], i1 true)
+; CHECK-NEXT:    [[TMP19:%.*]] = sub i129 [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ugt i129 [[TMP19]], 128
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP16]], i1 true, i1 [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i129 [[TMP19]], 128
+; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP21]], i129 0, i129 [[TMP13]]
+; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP21]], i1 true, i1 [[TMP22]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]]
+; CHECK:       udiv-loop-exit2:
+; CHECK-NEXT:    [[TMP25:%.*]] = phi i129 [ 0, [[UDIV_BB15]] ], [ [[TMP40:%.*]], [[UDIV_DO_WHILE3:%.*]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = phi i129 [ [[TMP49:%.*]], [[UDIV_BB15]] ], [ [[TMP37:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = shl i129 [[TMP26]], 1
+; CHECK-NEXT:    [[TMP28:%.*]] = or i129 [[TMP25]], [[TMP27]]
+; CHECK-NEXT:    br label [[UDIV_END1]]
+; CHECK:       udiv-do-while3:
+; CHECK-NEXT:    [[TMP29:%.*]] = phi i129 [ 0, [[UDIV_PREHEADER4:%.*]] ], [ [[TMP40]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP30:%.*]] = phi i129 [ [[TMP47:%.*]], [[UDIV_PREHEADER4]] ], [ [[TMP43:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = phi i129 [ [[TMP45:%.*]], [[UDIV_PREHEADER4]] ], [ [[TMP42:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP32:%.*]] = phi i129 [ [[TMP49]], [[UDIV_PREHEADER4]] ], [ [[TMP37]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP33:%.*]] = shl i129 [[TMP31]], 1
+; CHECK-NEXT:    [[TMP34:%.*]] = lshr i129 [[TMP32]], 128
+; CHECK-NEXT:    [[TMP35:%.*]] = or i129 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = shl i129 [[TMP32]], 1
+; CHECK-NEXT:    [[TMP37]] = or i129 [[TMP29]], [[TMP36]]
+; CHECK-NEXT:    [[TMP38:%.*]] = sub i129 [[TMP46:%.*]], [[TMP35]]
+; CHECK-NEXT:    [[TMP39:%.*]] = ashr i129 [[TMP38]], 128
+; CHECK-NEXT:    [[TMP40]] = and i129 [[TMP39]], 1
+; CHECK-NEXT:    [[TMP41:%.*]] = and i129 [[TMP39]], [[TMP12]]
+; CHECK-NEXT:    [[TMP42]] = sub i129 [[TMP35]], [[TMP41]]
+; CHECK-NEXT:    [[TMP43]] = add i129 [[TMP30]], -1
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i129 [[TMP43]], 0
+; CHECK-NEXT:    br i1 [[TMP44]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]]
+; CHECK:       udiv-preheader4:
+; CHECK-NEXT:    [[TMP45]] = lshr i129 [[TMP13]], [[TMP47]]
+; CHECK-NEXT:    [[TMP46]] = add i129 [[TMP12]], -1
+; CHECK-NEXT:    br label [[UDIV_DO_WHILE3]]
+; CHECK:       udiv-bb15:
+; CHECK-NEXT:    [[TMP47]] = add i129 [[TMP19]], 1
+; CHECK-NEXT:    [[TMP48:%.*]] = sub i129 128, [[TMP19]]
+; CHECK-NEXT:    [[TMP49]] = shl i129 [[TMP13]], [[TMP48]]
+; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i129 [[TMP47]], 0
+; CHECK-NEXT:    br i1 [[TMP50]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]]
+; CHECK:       udiv-end1:
+; CHECK-NEXT:    [[TMP51:%.*]] = phi i129 [ [[TMP28]], [[UDIV_LOOP_EXIT2]] ], [ [[TMP23]], [[_UDIV_SPECIAL_CASES_UDIV_SPECIAL_CASES:%.*]] ]
+; CHECK-NEXT:    [[TMP52:%.*]] = mul i129 [[TMP11]], [[TMP51]]
+; CHECK-NEXT:    [[TMP53:%.*]] = sub i129 [[TMP10]], [[TMP52]]
+; CHECK-NEXT:    [[TMP54:%.*]] = xor i129 [[TMP53]], [[TMP4]]
+; CHECK-NEXT:    [[TMP55:%.*]] = sub i129 [[TMP54]], [[TMP4]]
+; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <2 x i129> poison, i129 [[TMP55]], i64 0
+; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <2 x i129> [[B]], i64 1
+; CHECK-NEXT:    [[TMP59:%.*]] = freeze i129 [[TMP57]]
+; CHECK-NEXT:    [[TMP60:%.*]] = freeze i129 [[TMP58]]
+; CHECK-NEXT:    [[TMP61:%.*]] = ashr i129 [[TMP59]], 128
+; CHECK-NEXT:    [[TMP62:%.*]] = ashr i129 [[TMP60]], 128
+; CHECK-NEXT:    [[TMP63:%.*]] = xor i129 [[TMP59]], [[TMP61]]
+; CHECK-NEXT:    [[TMP64:%.*]] = xor i129 [[TMP60]], [[TMP62]]
+; CHECK-NEXT:    [[TMP65:%.*]] = sub i129 [[TMP63]], [[TMP61]]
+; CHECK-NEXT:    [[TMP66:%.*]] = sub i129 [[TMP64]], [[TMP62]]
+; CHECK-NEXT:    [[TMP67:%.*]] = freeze i129 [[TMP65]]
+; CHECK-NEXT:    [[TMP68:%.*]] = freeze i129 [[TMP66]]
+; CHECK-NEXT:    [[TMP69:%.*]] = freeze i129 [[TMP68]]
+; CHECK-NEXT:    [[TMP70:%.*]] = freeze i129 [[TMP67]]
+; CHECK-NEXT:    [[TMP71:%.*]] = icmp eq i129 [[TMP69]], 0
+; CHECK-NEXT:    [[TMP72:%.*]] = icmp eq i129 [[TMP70]], 0
+; CHECK-NEXT:    [[TMP73:%.*]] = or i1 [[TMP71]], [[TMP72]]
+; CHECK-NEXT:    [[TMP74:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP69]], i1 true)
+; CHECK-NEXT:    [[TMP75:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP70]], i1 true)
+; CHECK-NEXT:    [[TMP76:%.*]] = sub i129 [[TMP74]], [[TMP75]]
+; CHECK-NEXT:    [[TMP77:%.*]] = icmp ugt i129 [[TMP76]], 128
+; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP73]], i1 true, i1 [[TMP77]]
+; CHECK-NEXT:    [[TMP79:%.*]] = icmp eq i129 [[TMP76]], 128
+; CHECK-NEXT:    [[TMP80:%.*]] = select i1 [[TMP78]], i129 0, i129 [[TMP70]]
+; CHECK-NEXT:    [[TMP81:%.*]] = select i1 [[TMP78]], i1 true, i1 [[TMP79]]
+; CHECK-NEXT:    br i1 [[TMP81]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK:       udiv-loop-exit:
+; CHECK-NEXT:    [[TMP82:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP97:%.*]], [[UDIV_DO_WHILE:%.*]] ]
+; CHECK-NEXT:    [[TMP83:%.*]] = phi i129 [ [[TMP106:%.*]], [[UDIV_BB1]] ], [ [[TMP94:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP84:%.*]] = shl i129 [[TMP83]], 1
+; CHECK-NEXT:    [[TMP85:%.*]] = or i129 [[TMP82]], [[TMP84]]
+; CHECK-NEXT:    br label [[UDIV_END]]
+; CHECK:       udiv-do-while:
+; CHECK-NEXT:    [[TMP86:%.*]] = phi i129 [ 0, [[UDIV_PREHEADER:%.*]] ], [ [[TMP97]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP87:%.*]] = phi i129 [ [[TMP104:%.*]], [[UDIV_PREHEADER]] ], [ [[TMP100:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP88:%.*]] = phi i129 [ [[TMP102:%.*]], [[UDIV_PREHEADER]] ], [ [[TMP99:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP89:%.*]] = phi i129 [ [[TMP106]], [[UDIV_PREHEADER]] ], [ [[TMP94]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP90:%.*]] = shl i129 [[TMP88]], 1
+; CHECK-NEXT:    [[TMP91:%.*]] = lshr i129 [[TMP89]], 128
+; CHECK-NEXT:    [[TMP92:%.*]] = or i129 [[TMP90]], [[TMP91]]
+; CHECK-NEXT:    [[TMP93:%.*]] = shl i129 [[TMP89]], 1
+; CHECK-NEXT:    [[TMP94]] = or i129 [[TMP86]], [[TMP93]]
+; CHECK-NEXT:    [[TMP95:%.*]] = sub i129 [[TMP103:%.*]], [[TMP92]]
+; CHECK-NEXT:    [[TMP96:%.*]] = ashr i129 [[TMP95]], 128
+; CHECK-NEXT:    [[TMP97]] = and i129 [[TMP96]], 1
+; CHECK-NEXT:    [[TMP98:%.*]] = and i129 [[TMP96]], [[TMP69]]
+; CHECK-NEXT:    [[TMP99]] = sub i129 [[TMP92]], [[TMP98]]
+; CHECK-NEXT:    [[TMP100]] = add i129 [[TMP87]], -1
+; CHECK-NEXT:    [[TMP101:%.*]] = icmp eq i129 [[TMP100]], 0
+; CHECK-NEXT:    br i1 [[TMP101]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK:       udiv-preheader:
+; CHECK-NEXT:    [[TMP102]] = lshr i129 [[TMP70]], [[TMP104]]
+; CHECK-NEXT:    [[TMP103]] = add i129 [[TMP69]], -1
+; CHECK-NEXT:    br label [[UDIV_DO_WHILE]]
+; CHECK:       udiv-bb1:
+; CHECK-NEXT:    [[TMP104]] = add i129 [[TMP76]], 1
+; CHECK-NEXT:    [[TMP105:%.*]] = sub i129 128, [[TMP76]]
+; CHECK-NEXT:    [[TMP106]] = shl i129 [[TMP70]], [[TMP105]]
+; CHECK-NEXT:    [[TMP107:%.*]] = icmp eq i129 [[TMP104]], 0
+; CHECK-NEXT:    br i1 [[TMP107]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK:       udiv-end:
+; CHECK-NEXT:    [[TMP108:%.*]] = phi i129 [ [[TMP85]], [[UDIV_LOOP_EXIT]] ], [ [[TMP80]], [[UDIV_END1]] ]
+; CHECK-NEXT:    [[TMP109:%.*]] = mul i129 [[TMP68]], [[TMP108]]
+; CHECK-NEXT:    [[TMP110:%.*]] = sub i129 [[TMP67]], [[TMP109]]
+; CHECK-NEXT:    [[TMP111:%.*]] = xor i129 [[TMP110]], [[TMP61]]
+; CHECK-NEXT:    [[TMP112:%.*]] = sub i129 [[TMP111]], [[TMP61]]
+; CHECK-NEXT:    [[TMP113:%.*]] = insertelement <2 x i129> [[TMP56]], i129 [[TMP112]], i64 1
+; CHECK-NEXT:    ret <2 x i129> [[TMP113]]
+;
+  %res = srem <2 x i129> %a, %b
+  ret <2 x i129> %res
+}
+
+define <2 x i129> @urem129(<2 x i129> %a, <2 x i129> %b) nounwind {
+; CHECK-LABEL: define <2 x i129> @urem129(
+; CHECK-SAME: <2 x i129> [[A:%.*]], <2 x i129> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  _udiv-special-cases_udiv-special-cases:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i129> [[A]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i129> [[B]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = freeze i129 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i129 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze i129 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze i129 [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i129 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i129 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP4]], i1 true)
+; CHECK-NEXT:    [[TMP10:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP5]], i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i129 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i129 [[TMP11]], 128
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP8]], i1 true, i1 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i129 [[TMP11]], 128
+; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP13]], i129 0, i129 [[TMP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP13]], i1 true, i1 [[TMP14]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]]
+; CHECK:       udiv-loop-exit2:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi i129 [ 0, [[UDIV_BB15]] ], [ [[TMP32:%.*]], [[UDIV_DO_WHILE3:%.*]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i129 [ [[TMP41:%.*]], [[UDIV_BB15]] ], [ [[TMP29:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = shl i129 [[TMP18]], 1
+; CHECK-NEXT:    [[TMP20:%.*]] = or i129 [[TMP17]], [[TMP19]]
+; CHECK-NEXT:    br label [[UDIV_END1]]
+; CHECK:       udiv-do-while3:
+; CHECK-NEXT:    [[TMP21:%.*]] = phi i129 [ 0, [[UDIV_PREHEADER4:%.*]] ], [ [[TMP32]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi i129 [ [[TMP39:%.*]], [[UDIV_PREHEADER4]] ], [ [[TMP35:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = phi i129 [ [[TMP37:%.*]], [[UDIV_PREHEADER4]] ], [ [[TMP34:%.*]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = phi i129 [ [[TMP41]], [[UDIV_PREHEADER4]] ], [ [[TMP29]], [[UDIV_DO_WHILE3]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = shl i129 [[TMP23]], 1
+; CHECK-NEXT:    [[TMP26:%.*]] = lshr i129 [[TMP24]], 128
+; CHECK-NEXT:    [[TMP27:%.*]] = or i129 [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = shl i129 [[TMP24]], 1
+; CHECK-NEXT:    [[TMP29]] = or i129 [[TMP21]], [[TMP28]]
+; CHECK-NEXT:    [[TMP30:%.*]] = sub i129 [[TMP38:%.*]], [[TMP27]]
+; CHECK-NEXT:    [[TMP31:%.*]] = ashr i129 [[TMP30]], 128
+; CHECK-NEXT:    [[TMP32]] = and i129 [[TMP31]], 1
+; CHECK-NEXT:    [[TMP33:%.*]] = and i129 [[TMP31]], [[TMP4]]
+; CHECK-NEXT:    [[TMP34]] = sub i129 [[TMP27]], [[TMP33]]
+; CHECK-NEXT:    [[TMP35]] = add i129 [[TMP22]], -1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i129 [[TMP35]], 0
+; CHECK-NEXT:    br i1 [[TMP36]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]]
+; CHECK:       udiv-preheader4:
+; CHECK-NEXT:    [[TMP37]] = lshr i129 [[TMP5]], [[TMP39]]
+; CHECK-NEXT:    [[TMP38]] = add i129 [[TMP4]], -1
+; CHECK-NEXT:    br label [[UDIV_DO_WHILE3]]
+; CHECK:       udiv-bb15:
+; CHECK-NEXT:    [[TMP39]] = add i129 [[TMP11]], 1
+; CHECK-NEXT:    [[TMP40:%.*]] = sub i129 128, [[TMP11]]
+; CHECK-NEXT:    [[TMP41]] = shl i129 [[TMP5]], [[TMP40]]
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i129 [[TMP39]], 0
+; CHECK-NEXT:    br i1 [[TMP42]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]]
+; CHECK:       udiv-end1:
+; CHECK-NEXT:    [[TMP43:%.*]] = phi i129 [ [[TMP20]], [[UDIV_LOOP_EXIT2]] ], [ [[TMP15]], [[_UDIV_SPECIAL_CASES_UDIV_SPECIAL_CASES:%.*]] ]
+; CHECK-NEXT:    [[TMP44:%.*]] = mul i129 [[TMP3]], [[TMP43]]
+; CHECK-NEXT:    [[TMP45:%.*]] = sub i129 [[TMP2]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <2 x i129> poison, i129 [[TMP45]], i64 0
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i129> [[B]], i64 1
+; CHECK-NEXT:    [[TMP49:%.*]] = freeze i129 [[TMP47]]
+; CHECK-NEXT:    [[TMP50:%.*]] = freeze i129 [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = freeze i129 [[TMP50]]
+; CHECK-NEXT:    [[TMP52:%.*]] = freeze i129 [[TMP49]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp eq i129 [[TMP51]], 0
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i129 [[TMP52]], 0
+; CHECK-NEXT:    [[TMP55:%.*]] = or i1 [[TMP53]], [[TMP54]]
+; CHECK-NEXT:    [[TMP56:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP51]], i1 true)
+; CHECK-NEXT:    [[TMP57:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP52]], i1 true)
+; CHECK-NEXT:    [[TMP58:%.*]] = sub i129 [[TMP56]], [[TMP57]]
+; CHECK-NEXT:    [[TMP59:%.*]] = icmp ugt i129 [[TMP58]], 128
+; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP55]], i1 true, i1 [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp eq i129 [[TMP58]], 128
+; CHECK-NEXT:    [[TMP62:%.*]] = select i1 [[TMP60]], i129 0, i129 [[TMP52]]
+; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP60]], i1 true, i1 [[TMP61]]
+; CHECK-NEXT:    br i1 [[TMP63]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK:       udiv-loop-exit:
+; CHECK-NEXT:    [[TMP64:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP79:%.*]], [[UDIV_DO_WHILE:%.*]] ]
+; CHECK-NEXT:    [[TMP65:%.*]] = phi i129 [ [[TMP88:%.*]], [[UDIV_BB1]] ], [ [[TMP76:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP66:%.*]] = shl i129 [[TMP65]], 1
+; CHECK-NEXT:    [[TMP67:%.*]] = or i129 [[TMP64]], [[TMP66]]
+; CHECK-NEXT:    br label [[UDIV_END]]
+; CHECK:       udiv-do-while:
+; CHECK-NEXT:    [[TMP68:%.*]] = phi i129 [ 0, [[UDIV_PREHEADER:%.*]] ], [ [[TMP79]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP69:%.*]] = phi i129 [ [[TMP86:%.*]], [[UDIV_PREHEADER]] ], [ [[TMP82:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP70:%.*]] = phi i129 [ [[TMP84:%.*]], [[UDIV_PREHEADER]] ], [ [[TMP81:%.*]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP71:%.*]] = phi i129 [ [[TMP88]], [[UDIV_PREHEADER]] ], [ [[TMP76]], [[UDIV_DO_WHILE]] ]
+; CHECK-NEXT:    [[TMP72:%.*]] = shl i129 [[TMP70]], 1
+; CHECK-NEXT:    [[TMP73:%.*]] = lshr i129 [[TMP71]], 128
+; CHECK-NEXT:    [[TMP74:%.*]] = or i129 [[TMP72]], [[TMP73]]
+; CHECK-NEXT:    [[TMP75:%.*]] = shl i129 [[TMP71]], 1
+; CHECK-NEXT:    [[TMP76]] = or i129 [[TMP68]], [[TMP75]]
+; CHECK-NEXT:    [[TMP77:%.*]] = sub i129 [[TMP85:%.*]], [[TMP74]]
+; CHECK-NEXT:    [[TMP78:%.*]] = ashr i129 [[TMP77]], 128
+; CHECK-NEXT:    [[TMP79]] = and i129 [[TMP78]], 1
+; CHECK-NEXT:    [[TMP80:%.*]] = and i129 [[TMP78]], [[TMP51]]
+; CHECK-NEXT:    [[TMP81]] = sub i129 [[TMP74]], [[TMP80]]
+; CHECK-NEXT:    [[TMP82]] = add i129 [[TMP69]], -1
+; CHECK-NEXT:    [[TMP83:%.*]] = icmp eq i129 [[TMP82]], 0
+; CHECK-NEXT:    br i1 [[TMP83]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK:       udiv-preheader:
+; CHECK-NEXT:    [[TMP84]] = lshr i129 [[TMP52]], [[TMP86]]
+; CHECK-NEXT:    [[TMP85]] = add i129 [[TMP51]], -1
+; CHECK-NEXT:    br label [[UDIV_DO_WHILE]]
+; CHECK:       udiv-bb1:
+; CHECK-NEXT:    [[TMP86]] = add i129 [[TMP58]], 1
+; CHECK-NEXT:    [[TMP87:%.*]] = sub i129 128, [[TMP58]]
+; CHECK-NEXT:    [[TMP88]] = shl i129 [[TMP52]], [[TMP87]]
+; CHECK-NEXT:    [[TMP89:%.*]] = icmp eq i129 [[TMP86]], 0
+; CHECK-NEXT:    br i1 [[TMP89]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK:       udiv-end:
+; CHECK-NEXT:    [[TMP90:%.*]] = phi i129 [ [[TMP67]], [[UDIV_LOOP_EXIT]] ], [ [[TMP62]], [[UDIV_END1]] ]
+; CHECK-NEXT:    [[TMP91:%.*]] = mul i129 [[TMP50]], [[TMP90]]
+; CHECK-NEXT:    [[TMP92:%.*]] = sub i129 [[TMP49]], [[TMP91]]
+; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <2 x i129> [[TMP46]], i129 [[TMP92]], i64 1
+; CHECK-NEXT:    ret <2 x i129> [[TMP93]]
+;
+  %res = urem <2 x i129> %a, %b
+  ret <2 x i129> %res
+}



More information about the llvm-commits mailing list