[llvm] [ExpandIRInsts] Support saturating fptoi (PR #179710)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 4 09:37:57 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-transforms
Author: Nikita Popov (nikic)
<details>
<summary>Changes</summary>
Add support for expanding fptosi.sat and fptoui.sat via IR expansions. Similar to fptosi/fptoui we would get legalization errors otherwise.
The previous expansion for fptosi/fptoui was already saturating -- but those instructions do not actually require saturation, and the implementation of the saturation was incorrect in lots of ways. What this PR does is:
* For fptosi, remove the unnecessary saturation handling.
* For fptoui, remove the unnecessary saturation handling and sign multiplication.
* For fptosi.sat, use the previous saturation handling with fixes: We need to map NaNs to 0 and the saturation condition on the exponent was incorrect. (I'm performing the NaN check via fcmp -- there's no requirement to do everything bitwise here.)
* For fptoui.sat, use a variation of the signed saturation handling: Negative values need to go to zero and we saturate to unsigned max.
Proofs: https://alive2.llvm.org/ce/z/Xv9FNd
---
Patch is 161.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/179710.diff
5 Files Affected:
- (modified) llvm/lib/CodeGen/ExpandIRInsts.cpp (+86-42)
- (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+469-1416)
- (modified) llvm/test/Transforms/ExpandIRInsts/X86/expand-fp-convert-small.ll (+103-75)
- (modified) llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptosi129.ll (+9-51)
- (modified) llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptoui129.ll (+9-75)
``````````diff
diff --git a/llvm/lib/CodeGen/ExpandIRInsts.cpp b/llvm/lib/CodeGen/ExpandIRInsts.cpp
index af8dc03ff8037..0e504beac78e2 100644
--- a/llvm/lib/CodeGen/ExpandIRInsts.cpp
+++ b/llvm/lib/CodeGen/ExpandIRInsts.cpp
@@ -40,6 +40,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
@@ -495,7 +496,7 @@ static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
/// }
///
/// Replace fp to integer with generated code.
-static void expandFPToI(Instruction *FPToI) {
+static void expandFPToI(Instruction *FPToI, bool IsSaturating, bool IsSigned) {
// clang-format on
IRBuilder<> Builder(FPToI);
auto *FloatVal = FPToI->getOperand(0);
@@ -537,12 +538,15 @@ static void expandFPToI(Instruction *FPToI) {
BasicBlock *Entry = Builder.GetInsertBlock();
Function *F = Entry->getParent();
Entry->setName(Twine(Entry->getName(), "fp-to-i-entry"));
+ BasicBlock *CheckSaturateBB, *SaturateBB;
BasicBlock *End =
Entry->splitBasicBlock(Builder.GetInsertPoint(), "fp-to-i-cleanup");
- BasicBlock *CheckSaturateBB = BasicBlock::Create(
- Builder.getContext(), "fp-to-i-if-check.saturate", F, End);
- BasicBlock *SaturateBB =
- BasicBlock::Create(Builder.getContext(), "fp-to-i-if-saturate", F, End);
+ if (IsSaturating) {
+ CheckSaturateBB = BasicBlock::Create(Builder.getContext(),
+ "fp-to-i-if-check.saturate", F, End);
+ SaturateBB =
+ BasicBlock::Create(Builder.getContext(), "fp-to-i-if-saturate", F, End);
+ }
BasicBlock *CheckExpSizeBB = BasicBlock::Create(
Builder.getContext(), "fp-to-i-if-check.exp.size", F, End);
BasicBlock *ExpSmallBB =
@@ -563,40 +567,56 @@ static void expandFPToI(Instruction *FPToI) {
FloatVal =
Builder.CreateFPExt(FloatVal, Type::getFP128Ty(Builder.getContext()));
Value *ARep = Builder.CreateBitCast(FloatVal, FloatIntTy);
- Value *PosOrNeg =
- Builder.CreateICmpSGT(ARep, ConstantInt::getSigned(FloatIntTy, -1));
- Value *Sign = Builder.CreateSelect(PosOrNeg, ConstantInt::getSigned(IntTy, 1),
- ConstantInt::getSigned(IntTy, -1), "sign");
+ Value *PosOrNeg, *Sign;
+ if (IsSigned) {
+ PosOrNeg =
+ Builder.CreateICmpSGT(ARep, ConstantInt::getSigned(FloatIntTy, -1));
+ Sign = Builder.CreateSelect(PosOrNeg, ConstantInt::getSigned(IntTy, 1),
+ ConstantInt::getSigned(IntTy, -1), "sign");
+ }
Value *And =
Builder.CreateLShr(ARep, Builder.getIntN(FloatWidth, FPMantissaWidth));
Value *BiasedExp = Builder.CreateAnd(
And, Builder.getIntN(FloatWidth, (1 << ExponentWidth) - 1), "biased.exp");
Value *Abs = Builder.CreateAnd(ARep, SignificandMask);
Value *Significand = Builder.CreateOr(Abs, ImplicitBit, "significand");
- Value *ExpIsNegative = Builder.CreateICmpULT(
+ Value *ZeroResultCond = Builder.CreateICmpULT(
BiasedExp, Builder.getIntN(FloatWidth, ExponentBias), "exp.is.negative");
- Builder.CreateCondBr(ExpIsNegative, End, CheckSaturateBB);
-
- // check.saturate:
- Builder.SetInsertPoint(CheckSaturateBB);
- Value *Add1 = Builder.CreateAdd(
- BiasedExp,
- ConstantInt::getSigned(FloatIntTy,
- -static_cast<int64_t>(ExponentBias + BitWidth)));
- Value *Cmp3 = Builder.CreateICmpULT(
- Add1,
- ConstantInt::getSigned(FloatIntTy, -static_cast<int64_t>(BitWidth)));
- Builder.CreateCondBr(Cmp3, SaturateBB, CheckExpSizeBB);
-
- // saturate:
- Builder.SetInsertPoint(SaturateBB);
- Value *SignedMax =
- ConstantInt::get(IntTy, APInt::getSignedMaxValue(BitWidth));
- Value *SignedMin =
- ConstantInt::get(IntTy, APInt::getSignedMinValue(BitWidth));
- Value *Saturated =
- Builder.CreateSelect(PosOrNeg, SignedMax, SignedMin, "saturated");
- Builder.CreateBr(End);
+ if (IsSaturating) {
+ Value *IsNaN = Builder.CreateFCmpUNO(FloatVal, FloatVal, "is.nan");
+ ZeroResultCond = Builder.CreateOr(ZeroResultCond, IsNaN);
+ if (!IsSigned) {
+ Value *IsNeg = Builder.CreateIsNeg(ARep);
+ ZeroResultCond = Builder.CreateOr(ZeroResultCond, IsNeg);
+ }
+ }
+ Builder.CreateCondBr(ZeroResultCond, End,
+ IsSaturating ? CheckSaturateBB : CheckExpSizeBB);
+
+ Value *Saturated;
+ if (IsSaturating) {
+ // check.saturate:
+ Builder.SetInsertPoint(CheckSaturateBB);
+ Value *Cmp3 = Builder.CreateICmpUGE(
+ BiasedExp, ConstantInt::getSigned(
+ FloatIntTy, static_cast<int64_t>(ExponentBias +
+ BitWidth - IsSigned)));
+ Builder.CreateCondBr(Cmp3, SaturateBB, CheckExpSizeBB);
+
+ // saturate:
+ Builder.SetInsertPoint(SaturateBB);
+ if (IsSigned) {
+ Value *SignedMax =
+ ConstantInt::get(IntTy, APInt::getSignedMaxValue(BitWidth));
+ Value *SignedMin =
+ ConstantInt::get(IntTy, APInt::getSignedMinValue(BitWidth));
+ Saturated =
+ Builder.CreateSelect(PosOrNeg, SignedMax, SignedMin, "saturated");
+ } else {
+ Saturated = ConstantInt::getAllOnesValue(IntTy);
+ }
+ Builder.CreateBr(End);
+ }
// if.end9:
Builder.SetInsertPoint(CheckExpSizeBB);
@@ -609,9 +629,10 @@ static void expandFPToI(Instruction *FPToI) {
Builder.SetInsertPoint(ExpSmallBB);
Value *Sub13 = Builder.CreateSub(
Builder.getIntN(FloatWidth, ExponentBias + FPMantissaWidth), BiasedExp);
- Value *Shr14 =
+ Value *ExpSmallRes =
Builder.CreateZExtOrTrunc(Builder.CreateLShr(Significand, Sub13), IntTy);
- Value *Mul = Builder.CreateMul(Shr14, Sign);
+ if (IsSigned)
+ ExpSmallRes = Builder.CreateMul(ExpSmallRes, Sign);
Builder.CreateBr(End);
// exp.large:
@@ -621,18 +642,20 @@ static void expandFPToI(Instruction *FPToI) {
ConstantInt::getSigned(
FloatIntTy, -static_cast<int64_t>(ExponentBias + FPMantissaWidth)));
Value *SignificandCast = Builder.CreateZExtOrTrunc(Significand, IntTy);
- Value *Shl = Builder.CreateShl(SignificandCast,
- Builder.CreateZExtOrTrunc(Sub15, IntTy));
- Value *Mul16 = Builder.CreateMul(Shl, Sign);
+ Value *ExpLargeRes = Builder.CreateShl(
+ SignificandCast, Builder.CreateZExtOrTrunc(Sub15, IntTy));
+ if (IsSigned)
+ ExpLargeRes = Builder.CreateMul(ExpLargeRes, Sign);
Builder.CreateBr(End);
// cleanup:
Builder.SetInsertPoint(End, End->begin());
- PHINode *Retval0 = Builder.CreatePHI(FPToI->getType(), 4);
+ PHINode *Retval0 = Builder.CreatePHI(FPToI->getType(), 3 + IsSaturating);
- Retval0->addIncoming(Saturated, SaturateBB);
- Retval0->addIncoming(Mul, ExpSmallBB);
- Retval0->addIncoming(Mul16, ExpLargeBB);
+ if (IsSaturating)
+ Retval0->addIncoming(Saturated, SaturateBB);
+ Retval0->addIncoming(ExpSmallRes, ExpSmallBB);
+ Retval0->addIncoming(ExpLargeRes, ExpLargeBB);
Retval0->addIncoming(Builder.getIntN(BitWidth, 0), Entry);
FPToI->replaceAllUsesWith(Retval0);
@@ -1080,6 +1103,16 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
// The backend has peephole optimizations for powers of two.
// TODO: We don't consider vectors here.
&& !isConstantPowerOfTwo(I.getOperand(1), isSigned(I.getOpcode()));
+ case Instruction::Call: {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ if (II && (II->getIntrinsicID() == Intrinsic::fptoui_sat ||
+ II->getIntrinsicID() == Intrinsic::fptosi_sat)) {
+ return !DisableExpandLargeFp &&
+ cast<IntegerType>(Ty->getScalarType())->getIntegerBitWidth() >
+ MaxLegalFpConvertBitWidth;
+ }
+ return false;
+ }
}
return false;
@@ -1115,8 +1148,10 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
}
case Instruction::FPToUI:
+ expandFPToI(I, /*IsSaturating=*/false, /*IsSigned=*/false);
+ break;
case Instruction::FPToSI:
- expandFPToI(I);
+ expandFPToI(I, /*IsSaturating=*/false, /*IsSigned=*/true);
break;
case Instruction::UIToFP:
@@ -1132,6 +1167,15 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
case Instruction::SRem:
expandRemainder(cast<BinaryOperator>(I));
break;
+
+ case Instruction::Call: {
+ auto *II = cast<IntrinsicInst>(I);
+ assert(II->getIntrinsicID() == Intrinsic::fptoui_sat ||
+ II->getIntrinsicID() == Intrinsic::fptosi_sat);
+ expandFPToI(I, /*IsSaturating=*/true,
+ /*IsSigned=*/II->getIntrinsicID() == Intrinsic::fptosi_sat);
+ break;
+ }
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 200fbf5d220b4..472e79f58969f 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -16,105 +16,79 @@ define i128 @fptosi_f64_to_i128(double %x) {
; SDAG-NEXT: v_mov_b32_e32 v2, 0
; SDAG-NEXT: v_mov_b32_e32 v1, 0
; SDAG-NEXT: v_mov_b32_e32 v3, 0
-; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; SDAG-NEXT: s_cbranch_execz .LBB0_10
-; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-check.saturate
-; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
-; SDAG-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
-; SDAG-NEXT: s_movk_i32 s6, 0xff7f
-; SDAG-NEXT: s_mov_b32 s7, -1
-; SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
-; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7]
-; SDAG-NEXT: s_cbranch_execz .LBB0_7
-; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-check.exp.size
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0
-; SDAG-NEXT: s_mov_b64 s[6:7], 0x432
+; SDAG-NEXT: s_cbranch_execz .LBB0_6
+; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-check.exp.size
+; SDAG-NEXT: s_mov_b64 s[4:5], 0x432
+; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v5
; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5
-; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5]
+; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; SDAG-NEXT: v_or_b32_e32 v9, 1, v8
; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7]
-; SDAG-NEXT: s_cbranch_execz .LBB0_4
-; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-exp.large
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB0_3
+; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-exp.large
; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6
; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6
; SDAG-NEXT: v_add_u32_e32 v3, 0xfffffbcd, v6
; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5]
; SDAG-NEXT: v_lshlrev_b64 v[6:7], v2, v[4:5]
; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v3
; SDAG-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[6:7]
-; SDAG-NEXT: v_mul_lo_u32 v12, v10, v1
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v10, 0
+; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[4:5]
+; SDAG-NEXT: v_mul_lo_u32 v11, v9, v1
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v9, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_cndmask_b32_e32 v13, 0, v4, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v13, v10, v[1:2]
-; SDAG-NEXT: v_mul_lo_u32 v11, v8, v5
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v10, v5, 0
+; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v4, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v9, v[1:2]
+; SDAG-NEXT: v_mul_lo_u32 v10, v8, v5
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v5, 0
; SDAG-NEXT: v_mov_b32_e32 v1, v3
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[1:2]
-; SDAG-NEXT: v_add3_u32 v6, v6, v12, v11
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v9, v7, v[5:6]
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v8, v[1:2]
+; SDAG-NEXT: v_add3_u32 v6, v6, v11, v10
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v7, v[5:6]
; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc
-; SDAG-NEXT: v_mul_lo_u32 v10, v9, v13
-; SDAG-NEXT: v_mul_lo_u32 v7, v9, v7
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v13, v8, v[2:3]
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_mul_lo_u32 v9, v8, v12
+; SDAG-NEXT: v_mul_lo_u32 v7, v8, v7
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v8, v[2:3]
; SDAG-NEXT: ; implicit-def: $vgpr8
-; SDAG-NEXT: ; implicit-def: $vgpr9
-; SDAG-NEXT: v_add3_u32 v4, v7, v6, v10
+; SDAG-NEXT: v_add3_u32 v4, v7, v6, v9
; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; SDAG-NEXT: ; implicit-def: $vgpr10
-; SDAG-NEXT: .LBB0_4: ; %Flow
-; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
-; SDAG-NEXT: s_cbranch_execz .LBB0_6
-; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-exp.small
+; SDAG-NEXT: ; implicit-def: $vgpr9
+; SDAG-NEXT: .LBB0_3: ; %Flow
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9]
+; SDAG-NEXT: s_cbranch_execz .LBB0_5
+; SDAG-NEXT: ; %bb.4: ; %fp-to-i-if-exp.small
; SDAG-NEXT: v_sub_u32_e32 v0, 0x433, v6
; SDAG-NEXT: v_lshrrev_b64 v[4:5], v0, v[4:5]
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v4, v10, 0
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[12:13], v5, v10, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v5, v9, v[1:2]
; SDAG-NEXT: v_mov_b32_e32 v1, v6
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v4, v8, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v4, v8, v[1:2]
; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[12:13], 0, 0, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v5, v8, v[2:3]
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v9, v4, v[2:3]
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v9, v4, v[3:4]
-; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3
-; SDAG-NEXT: .LBB0_6: ; %Flow1
-; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: .LBB0_7: ; %Flow2
-; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11]
-; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-saturate
-; SDAG-NEXT: v_bfrev_b32_e32 v0, 1
-; SDAG-NEXT: v_bfrev_b32_e32 v1, -2
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v3, v2
-; SDAG-NEXT: v_mov_b32_e32 v0, v1
-; SDAG-NEXT: v_mov_b32_e32 v2, v1
-; SDAG-NEXT: ; %bb.9: ; %Flow3
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[8:9], 0, 0, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[2:3]
+; SDAG-NEXT: v_mul_lo_u32 v5, v8, v5
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v8, v4, v[2:3]
+; SDAG-NEXT: v_mul_lo_u32 v4, v8, v4
+; SDAG-NEXT: v_add3_u32 v3, v4, v3, v5
+; SDAG-NEXT: .LBB0_5: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB0_6: ; %fp-to-i-cleanup
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup
-; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: fptosi_f64_to_i128:
@@ -134,24 +108,14 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v1, s5
; GISEL-NEXT: v_mov_b32_e32 v2, s6
; GISEL-NEXT: v_mov_b32_e32 v3, s7
-; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
-; GISEL-NEXT: s_cbranch_execz .LBB0_10
-; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-check.saturate
-; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
-; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v3, -1
-; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
-; GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
-; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7]
-; GISEL-NEXT: s_cbranch_execz .LBB0_7
-; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-check.exp.size
-; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB0_6
+; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-check.exp.size
+; GISEL-NEXT: v_cmp_ge_i64_e32 vcc, -1, v[4:5]
+; GISEL-NEXT: s_mov_b32 s4, 0x100000
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
@@ -205,137 +169,62 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v0, 0x433
; GISEL-NEXT: v_mov_b32_e32 v1, 0
; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff
-; GISEL-NEXT: s_mov_b32 s6, 0x100000
; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
-; GISEL-NEXT: v_and_or_b32 v5, v5, v2, s6
+; GISEL-NEXT: v_and_or_b32 v5, v5, v2, s4
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
-; GISEL-NEXT: s_cbranch_execz .LBB0_4
-; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-exp.large
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB0_3
+; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-exp.large
; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0
; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6
; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2
; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v11, v9, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v12, v8, v[5:6]
; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v10, v9, v[1:2]
; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6]
-; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11]
-; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[5:6]
+; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v10, s[8:9]
+; GISEL-NEXT: v_addc_c...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/179710
More information about the llvm-commits
mailing list