[llvm] [AMDGPU] expand-fp: unify scalarization (NFC) (PR #158588)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 15 02:19:19 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Frederik Harwath (frederik-h)
Changes:
Extend the existing "scalarize" function, which is used for the fp/integer
conversion instruction expansion, to BinaryOperator instructions, and reuse
it for the frem expansion. A similar function for scalarizing BinaryOperator
instructions exists in the ExpandLargeDivRem pass, and this change is a step
towards merging that pass with ExpandFp.
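
For reference, this is the unified helper after the change, condensed from the diff below. The hunk is truncated, so the setup lines at the top are an approximation of the existing pass code rather than a verbatim copy:

```cpp
// Condensed sketch of the extended scalarize(): it now expands vector
// BinaryOperators (e.g. frem) lane by lane in addition to the fp/int
// casts, and queues every scalar clone in Replace for later expansion.
static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
  // Setup as in the existing pass (not visible in the truncated hunk).
  auto *VTy = cast<FixedVectorType>(I->getType());
  unsigned NumElements = VTy->getNumElements();
  IRBuilder<> Builder(I);

  Value *Result = PoisonValue::get(VTy);
  for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
    Value *Ext = Builder.CreateExtractElement(I->getOperand(0), Idx);
    Value *Op;
    if (isa<BinaryOperator>(I))
      // New: lane-wise binary op on the extracted lanes of both operands.
      Op = Builder.CreateBinOp(
          cast<BinaryOperator>(I)->getOpcode(), Ext,
          Builder.CreateExtractElement(I->getOperand(1), Idx));
    else
      Op = Builder.CreateCast(cast<CastInst>(I)->getOpcode(), Ext,
                              I->getType()->getScalarType());
    Result = Builder.CreateInsertElement(Result, Op, Idx);
    if (auto *ScalarizedI = dyn_cast<Instruction>(Op)) {
      ScalarizedI->copyIRFlags(I, true); // preserve e.g. fast-math flags
      Replace.push_back(ScalarizedI);
    }
  }

  I->replaceAllUsesWith(Result);
  I->dropAllReferences();
  I->eraseFromParent();
}
```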
Further refactoring: extract a function that dispatches instructions to the
scalar and vector work queues, and hoist the check for scalable vectors to
the top of the instruction visiting loop.
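
The dispatch helper and the reshaped visiting loop, again condensed from the diff; only the FRem case is spelled out here, the fp/int conversion cases are handled analogously after their bit-width checks:

```cpp
// New helper: route an instruction to the scalar or vector work queue,
// depending on whether its first operand has vector type.
static void enqueueInstruction(Instruction &I,
                               SmallVector<Instruction *, 4> &Replace,
                               SmallVector<Instruction *, 4> &ReplaceVector) {
  if (I.getOperand(0)->getType()->isVectorTy())
    ReplaceVector.push_back(&I);
  else
    Replace.push_back(&I);
}

// Visiting loop in runImpl(): the scalable-vector check is now done once,
// before the opcode switch, instead of per case.
for (auto &I : instructions(F)) {
  Type *Ty = I.getType();
  // TODO: This pass doesn't handle scalable vectors.
  if (Ty->isScalableTy())
    continue;
  switch (I.getOpcode()) {
  case Instruction::FRem:
    if (!targetSupportsFrem(TLI, Ty) &&
        FRemExpander::canExpandType(Ty->getScalarType())) {
      enqueueInstruction(I, Replace, ReplaceVector);
      Modified = true;
    }
    break;
  // FPToUI/FPToSI and UIToFP/SIToFP: check the integer bit width, then
  // call enqueueInstruction(I, Replace, ReplaceVector) as above.
  default:
    break;
  }
}
```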
---
Patch is 383.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158588.diff
4 Files Affected:
- (modified) llvm/lib/CodeGen/ExpandFp.cpp (+45-65)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll (+140-140)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+650-650)
- (modified) llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll (+140-140)
``````````diff
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 9cc6c6a706c58..e336da7b914f0 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -356,8 +356,9 @@ Value *FRemExpander::buildFRem(Value *X, Value *Y,
static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
- Type *ReturnTy = I.getType();
- assert(FRemExpander::canExpandType(ReturnTy->getScalarType()));
+ Type *Ty = I.getType();
+ assert(Ty->isFloatingPointTy() && "Instruction should have been scalarized");
+ assert(FRemExpander::canExpandType(Ty));
FastMathFlags FMF = I.getFastMathFlags();
// TODO Make use of those flags for optimization?
@@ -368,32 +369,10 @@ static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
B.setFastMathFlags(FMF);
B.SetCurrentDebugLocation(I.getDebugLoc());
- Type *ElemTy = ReturnTy->getScalarType();
- const FRemExpander Expander = FRemExpander::create(B, ElemTy);
-
- Value *Ret;
- if (ReturnTy->isFloatingPointTy())
- Ret = FMF.approxFunc()
- ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
- : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
- else {
- auto *VecTy = cast<FixedVectorType>(ReturnTy);
-
- // This could use SplitBlockAndInsertForEachLane but the interface
- // is a bit awkward for a constant number of elements and it will
- // boil down to the same code.
- // TODO Expand the FRem instruction only once and reuse the code.
- Value *Nums = I.getOperand(0);
- Value *Denums = I.getOperand(1);
- Ret = PoisonValue::get(I.getType());
- for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
- Value *Num = B.CreateExtractElement(Nums, I);
- Value *Denum = B.CreateExtractElement(Denums, I);
- Value *Rem = FMF.approxFunc() ? Expander.buildApproxFRem(Num, Denum)
- : Expander.buildFRem(Num, Denum, SQ);
- Ret = B.CreateInsertElement(Ret, Rem, I);
- }
- }
+ const FRemExpander Expander = FRemExpander::create(B, Ty);
+ Value *Ret = FMF.approxFunc()
+ ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
+ : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
I.replaceAllUsesWith(Ret);
Ret->takeName(&I);
@@ -948,12 +927,21 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
Value *Result = PoisonValue::get(VTy);
for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
Value *Ext = Builder.CreateExtractElement(I->getOperand(0), Idx);
- Value *Cast = Builder.CreateCast(cast<CastInst>(I)->getOpcode(), Ext,
- I->getType()->getScalarType());
- Result = Builder.CreateInsertElement(Result, Cast, Idx);
- if (isa<Instruction>(Cast))
- Replace.push_back(cast<Instruction>(Cast));
+ Value *Op;
+ if (isa<BinaryOperator>(I))
+ Op = Builder.CreateBinOp(
+ cast<BinaryOperator>(I)->getOpcode(), Ext,
+ Builder.CreateExtractElement(I->getOperand(1), Idx));
+ else
+ Op = Builder.CreateCast(cast<CastInst>(I)->getOpcode(), Ext,
+ I->getType()->getScalarType());
+ Result = Builder.CreateInsertElement(Result, Op, Idx);
+ if (auto *ScalarizedI = dyn_cast<Instruction>(Op)) {
+ ScalarizedI->copyIRFlags(I, true);
+ Replace.push_back(ScalarizedI);
+ }
}
+
I->replaceAllUsesWith(Result);
I->dropAllReferences();
I->eraseFromParent();
@@ -989,6 +977,16 @@ static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) {
return TLI.getLibcallName(fremToLibcall(Ty->getScalarType()));
}
+static void enqueueInstruction(Instruction &I,
+ SmallVector<Instruction *, 4> &Replace,
+ SmallVector<Instruction *, 4> &ReplaceVector) {
+
+ if (I.getOperand(0)->getType()->isVectorTy())
+ ReplaceVector.push_back(&I);
+ else
+ Replace.push_back(&I);
+}
+
static bool runImpl(Function &F, const TargetLowering &TLI,
AssumptionCache *AC) {
SmallVector<Instruction *, 4> Replace;
@@ -1004,55 +1002,37 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
return false;
for (auto &I : instructions(F)) {
- switch (I.getOpcode()) {
- case Instruction::FRem: {
- Type *Ty = I.getType();
- // TODO: This pass doesn't handle scalable vectors.
- if (Ty->isScalableTy())
- continue;
-
- if (targetSupportsFrem(TLI, Ty) ||
- !FRemExpander::canExpandType(Ty->getScalarType()))
- continue;
-
- Replace.push_back(&I);
- Modified = true;
+ Type *Ty = I.getType();
+ // TODO: This pass doesn't handle scalable vectors.
+ if (Ty->isScalableTy())
+ continue;
+ switch (I.getOpcode()) {
+ case Instruction::FRem:
+ if (!targetSupportsFrem(TLI, Ty) &&
+ FRemExpander::canExpandType(Ty->getScalarType())) {
+ enqueueInstruction(I, Replace, ReplaceVector);
+ Modified = true;
+ }
break;
- }
case Instruction::FPToUI:
case Instruction::FPToSI: {
- // TODO: This pass doesn't handle scalable vectors.
- if (I.getOperand(0)->getType()->isScalableTy())
- continue;
-
- auto *IntTy = cast<IntegerType>(I.getType()->getScalarType());
+ auto *IntTy = cast<IntegerType>(Ty->getScalarType());
if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
continue;
- if (I.getOperand(0)->getType()->isVectorTy())
- ReplaceVector.push_back(&I);
- else
- Replace.push_back(&I);
+ enqueueInstruction(I, Replace, ReplaceVector);
Modified = true;
break;
}
case Instruction::UIToFP:
case Instruction::SIToFP: {
- // TODO: This pass doesn't handle scalable vectors.
- if (I.getOperand(0)->getType()->isScalableTy())
- continue;
-
auto *IntTy =
cast<IntegerType>(I.getOperand(0)->getType()->getScalarType());
if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
continue;
- if (I.getOperand(0)->getType()->isVectorTy())
- ReplaceVector.push_back(&I);
- else
- Replace.push_back(&I);
- Modified = true;
+ enqueueInstruction(I, Replace, ReplaceVector);
break;
}
default:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 302b2395642d0..c87cfbdfe87b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -1048,7 +1048,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; CI-NEXT: s_cbranch_vccz .LBB9_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else20
; CI-NEXT: s_and_b32 s2, s0, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -1059,7 +1059,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB9_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute19
; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
@@ -1084,10 +1084,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB9_5: ; %frem.loop_body
+; CI-NEXT: .LBB9_5: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1103,7 +1103,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_7
; CI-NEXT: .LBB9_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB9_7: ; %frem.loop_exit
+; CI-NEXT: .LBB9_7: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1126,7 +1126,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; CI-NEXT: s_cbranch_vccz .LBB9_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: s_and_b32 s4, s2, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -1137,7 +1137,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s4, s4, 1
; CI-NEXT: s_cmp_lg_u32 s4, 0
; CI-NEXT: s_cbranch_scc1 .LBB9_16
-; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: ; %bb.11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
@@ -1162,10 +1162,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB9_13: ; %frem.loop_body27
+; CI-NEXT: .LBB9_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1181,7 +1181,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_15
; CI-NEXT: .LBB9_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB9_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1239,7 +1239,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; VI-NEXT: s_cbranch_vccz .LBB9_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else20
; VI-NEXT: s_and_b32 s2, s0, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -1250,7 +1250,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB9_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute19
; VI-NEXT: v_frexp_mant_f32_e32 v3, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; VI-NEXT: v_ldexp_f32 v1, v3, 1
@@ -1275,10 +1275,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
-; VI-NEXT: .LBB9_5: ; %frem.loop_body
+; VI-NEXT: .LBB9_5: ; %frem.loop_body27
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1294,7 +1294,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_7
; VI-NEXT: .LBB9_6:
; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: .LBB9_7: ; %frem.loop_exit
+; VI-NEXT: .LBB9_7: ; %frem.loop_exit28
; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
; VI-NEXT: v_ldexp_f32 v2, v5, v2
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1317,7 +1317,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; VI-NEXT: s_cbranch_vccz .LBB9_10
-; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: s_and_b32 s3, s4, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1328,7 +1328,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s3, s3, 1
; VI-NEXT: s_cmp_lg_u32 s3, 0
; VI-NEXT: s_cbranch_scc1 .LBB9_16
-; VI-NEXT: ; %bb.11: ; %frem.compute19
+; VI-NEXT: ; %bb.11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e32 v4, v2
; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; VI-NEXT: v_ldexp_f32 v2, v4, 1
@@ -1353,10 +1353,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
-; VI-NEXT: .LBB9_13: ; %frem.loop_body27
+; VI-NEXT: .LBB9_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1372,7 +1372,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_15
; VI-NEXT: .LBB9_14:
; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; VI-NEXT: .LBB9_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
; VI-NEXT: v_ldexp_f32 v3, v6, v3
; VI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1427,7 +1427,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; CI-NEXT: s_cbranch_vccz .LBB10_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else86
; CI-NEXT: s_and_b32 s0, s4, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1438,7 +1438,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s0, s0, 1
; CI-NEXT: s_cmp_lg_u32 s0, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute85
; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
@@ -1463,10 +1463,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB10_5: ; %frem.loop_body
+; CI-NEXT: .LBB10_5: ; %frem.loop_body93
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1482,7 +1482,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_7
; CI-NEXT: .LBB10_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB10_7: ; %frem.loop_exit
+; CI-NEXT: .LBB10_7: ; %frem.loop_exit94
; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1505,7 +1505,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; CI-NEXT: s_cbranch_vccz .LBB10_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else53
; CI-NEXT: s_and_b32 s1, s6, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1516,7 +1516,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s1, s1, 1
; CI-NEXT: s_cmp_lg_u32 s1, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_16
-; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: ; %bb.11: ; %frem.compute52
; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
@@ -1541,10 +1541,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB10_13: ; %frem.loop_body27
+; CI-NEXT: .LBB10_13: ; %frem.loop_body60
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1560,7 +1560,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_15
; CI-NEXT: .LBB10_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB10_15: ; %frem.loop_exit61
; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1581,7 +1581,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr2
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
; CI-NEXT: s_cbranch_vccz .LBB10_18
-; CI-NEXT: ; %bb.17: ; %frem.else53
+; CI-NEXT: ; %bb.17: ; %frem.else20
; CI-NEXT: s_and_b32 s1, s5, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
; CI-NEXT: v_mov_b32_e32 v2, s1
@@ -1592,7 +1592,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s1, s1, 1
; CI-NEXT: s_cmp_lg_u32 s1, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_24
-; CI-NEXT: ; %bb.19: ; %frem.compute52
+; CI-NEXT: ; %bb.19: ; %frem.compute19
; CI-NEXT: v_frexp_mant_f32_e32 v5, v3
; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3
; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1
@@ -1617,10 +1617,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4
; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_22
-; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v7
; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
-; CI-NEXT: .LBB10_21: ; %frem.loop_body60
+; CI-NEXT: .LBB10_21: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v7, v6
; CI-NEXT: v_mul_f32_e32 v6, v7, v5
@@ -1636,7 +1636,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_23
; CI-NEXT: .LBB10_22:
; CI-NEXT: v_mov_b32_e32 v7, v6
-; CI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; CI-NEXT: .LBB10_23: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4
; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4
; CI-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -1659,7 +1659,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr3
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4
; CI-NEXT: s_cbranch_vccz .LBB10_26
-; CI-NEXT: ; %bb.25: ; %frem.else86
+; CI-NEXT: ; %bb.25: ; %frem.else
; CI-NEXT: s_and_b32 s1, s7, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1670,7 +1670,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s1, s1, 1
; ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/158588