[llvm] [ExpandLargeDivRem] Scalarize vector types. (PR #86959)

via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 28 07:48:15 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-amdgpu

Author: Bevin Hansson (bevin-hansson)

<details>
<summary>Changes</summary>

expand-large-divrem cannot handle vector types.
If vectors with overly large element types survive
into isel, they will likely be scalarized there,
but since isel cannot legalize scalar integer types
of that size, it will assert.

Handle vector types in expand-large-divrem by
scalarizing them and then expanding each resulting
scalar operation. For large vectors, this results in
a *massive* code expansion, but it's better than
asserting.


---

Patch is 189.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/86959.diff


3 Files Affected:

- (modified) llvm/lib/CodeGen/ExpandLargeDivRem.cpp (+41-3) 
- (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+3228-5) 
- (added) llvm/test/Transforms/ExpandLargeDivRem/X86/vector.ll (+525) 


``````````diff
diff --git a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
index 973c814604b389..9a7cabacad7d2f 100644
--- a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
@@ -54,8 +54,34 @@ static bool isSigned(unsigned int Opcode) {
   return Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
 }
 
+static void scalarize(BinaryOperator *BO,
+                      SmallVectorImpl<BinaryOperator *> &Replace) {
+  VectorType *VTy = cast<VectorType>(BO->getType());
+  assert(!VTy->isScalableTy() && "Tried to scalarize scalable vector!");
+
+  IRBuilder<> Builder(BO);
+
+  unsigned NumElements = VTy->getElementCount().getKnownMinValue();
+  Value *Result = nullptr;
+  for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+    Value *LHS = Builder.CreateExtractElement(BO->getOperand(0), Idx);
+    Value *RHS = Builder.CreateExtractElement(BO->getOperand(1), Idx);
+    Value *Op = Builder.CreateBinOp(BO->getOpcode(), LHS, RHS);
+    Result = Builder.CreateInsertElement(
+        Result ? Result : PoisonValue::get(VTy), Op, Idx);
+    if (auto *NewBO = dyn_cast<BinaryOperator>(Op)) {
+      NewBO->copyIRFlags(Op, true);
+      Replace.push_back(NewBO);
+    }
+  }
+  BO->replaceAllUsesWith(Result);
+  BO->dropAllReferences();
+  BO->eraseFromParent();
+}
+
 static bool runImpl(Function &F, const TargetLowering &TLI) {
   SmallVector<BinaryOperator *, 4> Replace;
+  SmallVector<BinaryOperator *, 4> ReplaceVector;
   bool Modified = false;
 
   unsigned MaxLegalDivRemBitWidth = TLI.getMaxDivRemBitWidthSupported();
@@ -71,16 +97,23 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
     case Instruction::SDiv:
     case Instruction::URem:
     case Instruction::SRem: {
-      // TODO: This doesn't handle vectors.
-      auto *IntTy = dyn_cast<IntegerType>(I.getType());
+      // TODO: This pass doesn't handle scalable vectors.
+      if (I.getOperand(0)->getType()->isScalableTy())
+        continue;
+
+      auto *IntTy = dyn_cast<IntegerType>(I.getType()->getScalarType());
       if (!IntTy || IntTy->getIntegerBitWidth() <= MaxLegalDivRemBitWidth)
         continue;
 
       // The backend has peephole optimizations for powers of two.
+      // TODO: We don't consider vectors here.
       if (isConstantPowerOfTwo(I.getOperand(1), isSigned(I.getOpcode())))
         continue;
 
-      Replace.push_back(&cast<BinaryOperator>(I));
+      if (I.getOperand(0)->getType()->isVectorTy())
+        ReplaceVector.push_back(&cast<BinaryOperator>(I));
+      else
+        Replace.push_back(&cast<BinaryOperator>(I));
       Modified = true;
       break;
     }
@@ -89,6 +122,11 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
     }
   }
 
+  while (!ReplaceVector.empty()) {
+    BinaryOperator *BO = ReplaceVector.pop_back_val();
+    scalarize(BO, Replace);
+  }
+
   if (Replace.empty())
     return false;
 
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 46e2632e45a190..16a03badcb1329 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -1,25 +1,3248 @@
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
-
-; SDAG-ERR: LLVM ERROR: unsupported libcall legalization
-; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_v2i128_vv)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GISEL %s
 
 define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+; SDAG-LABEL: v_sdiv_v2i128_vv:
+; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_ashrrev_i32_e32 v24, 31, v3
+; SDAG-NEXT:    v_ashrrev_i32_e32 v25, 31, v11
+; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT:    v_mov_b32_e32 v26, v24
+; SDAG-NEXT:    v_mov_b32_e32 v27, v25
+; SDAG-NEXT:    v_xor_b32_e32 v17, v24, v3
+; SDAG-NEXT:    v_xor_b32_e32 v18, v24, v2
+; SDAG-NEXT:    v_xor_b32_e32 v1, v24, v1
+; SDAG-NEXT:    v_xor_b32_e32 v0, v24, v0
+; SDAG-NEXT:    v_xor_b32_e32 v19, v25, v11
+; SDAG-NEXT:    v_xor_b32_e32 v20, v25, v10
+; SDAG-NEXT:    v_xor_b32_e32 v9, v25, v9
+; SDAG-NEXT:    v_xor_b32_e32 v8, v25, v8
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v0, v24
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v1, v24, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v0, v2
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v18, v24, vcc
+; SDAG-NEXT:    v_add_i32_e64 v1, s[4:5], 32, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v3
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v17, v24, vcc
+; SDAG-NEXT:    v_or_b32_e32 v0, v2, v10
+; SDAG-NEXT:    v_ffbh_u32_e32 v17, v10
+; SDAG-NEXT:    v_min_u32_e32 v18, v1, v18
+; SDAG-NEXT:    v_sub_i32_e32 v28, vcc, v8, v25
+; SDAG-NEXT:    v_or_b32_e32 v1, v3, v11
+; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], 32, v17
+; SDAG-NEXT:    v_ffbh_u32_e32 v17, v11
+; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], 64, v18
+; SDAG-NEXT:    v_addc_u32_e64 v21, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v29, vcc, v9, v25, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v28
+; SDAG-NEXT:    v_min_u32_e32 v8, v8, v17
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v21, 0, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v20, v25, vcc
+; SDAG-NEXT:    v_add_i32_e64 v9, s[8:9], 32, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v29
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v8, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v19, v25, vcc
+; SDAG-NEXT:    v_or_b32_e32 v8, v28, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v19, v0
+; SDAG-NEXT:    v_min_u32_e32 v20, v9, v20
+; SDAG-NEXT:    v_or_b32_e32 v9, v29, v1
+; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 32, v19
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v1
+; SDAG-NEXT:    v_add_i32_e32 v20, vcc, 64, v20
+; SDAG-NEXT:    v_addc_u32_e64 v22, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_min_u32_e32 v8, v19, v21
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v22, 0, s[6:7]
+; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v20, v8, s[6:7]
+; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v8, v18
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v9, v17, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v17, 0x7f, v8
+; SDAG-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v16, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v16, vcc
+; SDAG-NEXT:    v_or_b32_e32 v16, v17, v18
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v9, v19
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_and_b32_e32 v16, 1, v20
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v11, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v10, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v3, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v2, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_6
+; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v8
+; SDAG-NEXT:    v_sub_i32_e64 v20, s[4:5], 63, v8
+; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[2:3], v20
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_or_b32_e32 v18, v30, v32
+; SDAG-NEXT:    v_sub_i32_e32 v34, vcc, 0x7f, v8
+; SDAG-NEXT:    v_or_b32_e32 v19, v31, v33
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[10:11], v34
+; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v34
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[2:3], v34
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_lshr_b64 v[18:19], v[2:3], v35
+; SDAG-NEXT:    v_or_b32_e32 v9, v9, v19
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v18
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v21, v9, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v20, v8, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, v23, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, v22, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v34
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_5
+; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT:    v_lshr_b64 v[16:17], v[2:3], v30
+; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v30
+; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
+; SDAG-NEXT:    v_lshr_b64 v[37:38], v[10:11], v30
+; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v28
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v22, 0
+; SDAG-NEXT:    v_mov_b32_e32 v23, 0
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; SDAG-NEXT:    v_lshl_b64 v[48:49], v[10:11], v35
+; SDAG-NEXT:    v_lshr_b64 v[10:11], v[10:11], v36
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v29, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v17, v49
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v48
+; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v0, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v11, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v10, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v38, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v37, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v1, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
+; SDAG-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:  .LBB0_3: ; %udiv-do-while3
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v3
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v9
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v21
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v16
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v38
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v39
+; SDAG-NEXT:    v_or_b32_e32 v9, v19, v9
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v34, v2
+; SDAG-NEXT:    v_or_b32_e32 v8, v18, v8
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v35, v3, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v36, v10, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v37, v11, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v38, 31, v16
+; SDAG-NEXT:    v_and_b32_e32 v39, v38, v28
+; SDAG-NEXT:    v_and_b32_e32 v48, v38, v29
+; SDAG-NEXT:    v_and_b32_e32 v49, v38, v0
+; SDAG-NEXT:    v_and_b32_e32 v16, 1, v38
+; SDAG-NEXT:    v_and_b32_e32 v38, v38, v1
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v39
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v48, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v10, v49, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v11, v38, vcc
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; SDAG-NEXT:    v_or_b32_e32 v38, v30, v32
+; SDAG-NEXT:    v_or_b32_e32 v39, v31, v33
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT:    v_or_b32_e32 v21, v23, v21
+; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT:    v_or_b32_e32 v20, v22, v20
+; SDAG-NEXT:    v_mov_b32_e32 v23, v17
+; SDAG-NEXT:    v_mov_b32_e32 v22, v16
+; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT:    s_cbranch_execnz .LBB0_3
+; SDAG-NEXT:  ; %bb.4: ; %Flow13
+; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT:  .LBB0_5: ; %Flow14
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_lshl_b64 v[0:1], v[8:9], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v21
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[20:21], 1
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v8
+; SDAG-NEXT:    v_or_b32_e32 v20, v19, v1
+; SDAG-NEXT:    v_or_b32_e32 v21, v17, v3
+; SDAG-NEXT:    v_or_b32_e32 v17, v18, v0
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v2
+; SDAG-NEXT:  .LBB0_6: ; %Flow16
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_ashrrev_i32_e32 v18, 31, v7
+; SDAG-NEXT:    v_ashrrev_i32_e32 v19, 31, v15
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT:    v_mov_b32_e32 v22, v18
+; SDAG-NEXT:    v_mov_b32_e32 v23, v19
+; SDAG-NEXT:    v_xor_b32_e32 v0, v18, v7
+; SDAG-NEXT:    v_xor_b32_e32 v1, v18, v6
+; SDAG-NEXT:    v_xor_b32_e32 v3, v18, v5
+; SDAG-NEXT:    v_xor_b32_e32 v2, v18, v4
+; SDAG-NEXT:    v_xor_b32_e32 v6, v19, v15
+; SDAG-NEXT:    v_xor_b32_e32 v7, v19, v14
+; SDAG-NEXT:    v_xor_b32_e32 v8, v19, v13
+; SDAG-NEXT:    v_xor_b32_e32 v10, v19, v12
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v18
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v18, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v2
+; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v1, v18, vcc
+; SDAG-NEXT:    v_add_i32_e64 v1, s[4:5], 32, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v3
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v0, v18, vcc
+; SDAG-NEXT:    v_or_b32_e32 v0, v2, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v12, v4
+; SDAG-NEXT:    v_min_u32_e32 v11, v1, v11
+; SDAG-NEXT:    v_sub_i32_e32 v28, vcc, v10, v19
+; SDAG-NEXT:    v_or_b32_e32 v1, v3, v5
+; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], 32, v12
+; SDAG-NEXT:    v_ffbh_u32_e32 v12, v5
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], 64, v11
+; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v29, vcc, v8, v19, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v28
+; SDAG-NEXT:    v_min_u32_e32 v8, v10, v12
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v13, 0, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v7, v19, vcc
+; SDAG-NEXT:    v_add_i32_e64 v7, s[8:9], 32, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v12, v29
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v11, v8, s[6:7]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v6, v19, vcc
+; SDAG-NEXT:    v_or_b32_e32 v6, v28, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v0
+; SDAG-NEXT:    v_min_u32_e32 v12, v7, v12
+; SDAG-NEXT:    v_or_b32_e32 v7, v29, v1
+; SDAG-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
+; SDAG-NEXT:    v_ffbh_u32_e32 v13, v1
+; SDAG-NEXT:    v_add_i32_e32 v12, vcc, 64, v12
+; SDAG-NEXT:    v_addc_u32_e64 v14, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT:    v_min_u32_e32 v6, v11, v13
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, v14, 0, s[6:7]
+; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s[6:7]
+; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v10, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v10, 0x7f, v6
+; SDAG-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v8
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v11, v7, v9
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_and_b32_e32 v10, 1, v12
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v10
+; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, v5, 0, s[4:5]
+; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v4, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v3, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v2, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_12
+; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v6
+; SDAG-NEXT:    v_sub_i32_e64 v12, s[4:5], 63, v6
+; SDAG-NEXT:    v_mov_b32_e32 v10, 0
+; SDAG-NEXT:    v_mov_b32_e32 v11, 0
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v7, vcc
+; SDAG-NEXT:    v_lshl_b64 v[12:13], v[2:3], v12
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v8, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v7, v30, v32
+; SDAG-NEXT:    v_sub_i32_e32 v9, vcc, 0x7f, v6
+; SDAG-NEXT:    v_or_b32_e32 v8, v31, v33
+; SDAG-NEXT:    v_lshl_b64 v[14:15], v[4:5], v9
+; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, 64, v9
+; SDAG-NEXT:    v_lshl_b64 v[34:35], v[2:3], v9
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
+; SDAG-NEXT:    v_lshr_b64 v[6:7], v[2:3], v6
+; SDAG-NEXT:    v_or_b32_e32 v7, v15, v7
+; SDAG-NEXT:    v_or_b32_e32 v6, v14, v6
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v9
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v13, v7, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, v12, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, v35, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v34, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v8, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    v_mov_b32_e32 v13, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_11
+; SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT:    v_lshr_b64 v[10:11], v[2:3], v30
+; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v30
+; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
+; SDAG-NEXT:    v_lshr_b64 v[37:38], v[4:5], v30
+; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v28
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v14, 0
+; SDAG-NEXT:    v_mov_b32_e32 v15, 0
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    v_mov_b32_e32 v13, 0
+; SDAG-NEXT:    v_lshl_b64 v[48:49], v[4:5], v35
+; SDAG-NEXT:    v_lshr_b64 v[4:5], v[4:5], v36
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v29, vcc
+; SDAG-NEXT:    v_or_b32_e32 v11, v11, v49
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v48
+; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v0, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v5, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v4, v10, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, v38, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, v37, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v1, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
+; SDAG-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v11, 0
+; SDAG-NEXT:  .LBB0_9: ; %udiv-do-while
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 31, v3
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v9
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/86959


More information about the llvm-commits mailing list