[llvm] expand-fp: Refactor modification status handling (NFC) (PR #163542)
Frederik Harwath via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 19 22:58:48 PDT 2025
https://github.com/frederik-h updated https://github.com/llvm/llvm-project/pull/163542
>From d929c74d4a3f70b4725bdd7f350f0f53ecfbfc37 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 13 Oct 2025 09:02:25 -0400
Subject: [PATCH 1/4] expand-fp: Refactor modification status handling (NFC)
This is a small refactoring that sets the return value
of the runImpl function, which indicates whether or
not the IR has been changed, in a single place instead
of setting it separately at each insertion of a
supported instruction into the worklist.
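In other words, the worklist becomes the single source of
truth: the IR counts as modified exactly when at least one
instruction was queued for expansion. A minimal sketch of the
pattern (illustrative only, not the exact ExpandFp code;
shouldExpand is a hypothetical predicate standing in for the
per-opcode checks):

  // Before: flag maintained at every insertion site.
  bool Modified = false;
  for (Instruction &I : instructions(F))
    if (shouldExpand(I)) {
      Worklist.push_back(&I);
      Modified = true;
    }

  // After: derive the flag from the worklist once.
  for (Instruction &I : instructions(F))
    if (shouldExpand(I))
      Worklist.push_back(&I);
  bool Modified = !Worklist.empty();

This is only equivalent as long as every worklist entry is in
fact rewritten later; if an entry could be skipped without
touching the IR, the single assignment would over-report
modification.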
---
llvm/lib/CodeGen/ExpandFp.cpp | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 04c700869cd69..4c10f701ce08b 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -941,10 +941,9 @@ static void scalarize(Instruction *I,
llvm_unreachable("Unsupported instruction type");
Result = Builder.CreateInsertElement(Result, NewOp, Idx);
- if (auto *ScalarizedI = dyn_cast<Instruction>(NewOp)) {
- ScalarizedI->copyIRFlags(I, true);
- Worklist.push_back(ScalarizedI);
- }
+ Instruction *ScalarizedI = cast<Instruction>(NewOp);
+ ScalarizedI->copyIRFlags(I, true);
+ Worklist.push_back(ScalarizedI);
}
I->replaceAllUsesWith(Result);
@@ -993,7 +992,6 @@ static void addToWorklist(Instruction &I,
static bool runImpl(Function &F, const TargetLowering &TLI,
AssumptionCache *AC) {
SmallVector<Instruction *, 4> Worklist;
- bool Modified = false;
unsigned MaxLegalFpConvertBitWidth =
TLI.getMaxLargeFPConvertBitWidthSupported();
@@ -1015,7 +1013,6 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
if (!targetSupportsFrem(TLI, Ty) &&
FRemExpander::canExpandType(Ty->getScalarType())) {
addToWorklist(I, Worklist);
- Modified = true;
}
break;
case Instruction::FPToUI:
@@ -1025,7 +1022,6 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
continue;
addToWorklist(I, Worklist);
- Modified = true;
break;
}
case Instruction::UIToFP:
@@ -1036,7 +1032,6 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
continue;
addToWorklist(I, Worklist);
- Modified = true;
break;
}
default:
@@ -1044,6 +1039,7 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
}
}
+ bool Modified = !Worklist.empty();
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
if (I->getOpcode() == Instruction::FRem) {
>From 57aa8bf501ccf44382d024c49bc0673525ed8c49 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Oct 2025 04:13:02 -0400
Subject: [PATCH 2/4] [AMDGPU] Add test cases for frem with constant operands
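The new tests exercise frem on <2 x double> with constant
vector operands: a constant zero numerator, a constant 1.0
denominator, and fully constant inputs. The IR shape under
test looks like this (taken from the cases added below):

  %r1 = frem <2 x double> <double 0.0, double 0.0>, %r0   ; constant numerator
  %r1 = frem <2 x double> %r0, <double 1.0, double 1.0>   ; constant denominator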
---
llvm/test/CodeGen/AMDGPU/frem.ll | 1358 ++++++++++++++++++++++++++++++
1 file changed, 1358 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 415828f32f920..901ce6146cc9b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -17589,5 +17589,1363 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
+
+define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: frem_v2f64_const_zero_num:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: v_mov_b32_e32 v1, s8
+; SI-NEXT: v_mov_b32_e32 v3, s4
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; CI-LABEL: frem_v2f64_const_zero_num:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s4, s2
+; CI-NEXT: s_mov_b32 s5, s3
+; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_and_b64 s[2:3], vcc, exec
+; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0
+; CI-NEXT: s_mov_b32 s2, s6
+; CI-NEXT: s_mov_b32 s3, s7
+; CI-NEXT: v_mov_b32_e32 v1, s8
+; CI-NEXT: v_mov_b32_e32 v2, v0
+; CI-NEXT: s_and_b64 s[4:5], vcc, exec
+; CI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0
+; CI-NEXT: v_mov_b32_e32 v3, s4
+; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: frem_v2f64_const_zero_num:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-NEXT: s_cselect_b32 s0, 0x7ff80000, 0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: frem_v2f64_const_zero_num:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[3:4]
+; GFX9-NEXT: s_cselect_b32 s4, 0x7ff80000, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: frem_v2f64_const_zero_num:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX10-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX10-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: frem_v2f64_const_zero_num:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[1:4], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX11-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX11-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX1150-LABEL: frem_v2f64_const_zero_num:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT: global_load_b128 v[1:4], v0, s[2:3]
+; GFX1150-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX1150-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX1150-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX1150-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX1150-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX1150-NEXT: v_mov_b32_e32 v3, s3
+; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64_const_zero_num:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: global_load_b128 v[1:4], v0, s[2:3]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX1200-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX1200-NEXT: s_cselect_b32 s2, 0x7ff80000, 0
+; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX1200-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GFX1200-NEXT: s_cselect_b32 s3, 0x7ff80000, 0
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: v_mov_b32_e32 v3, s3
+; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
+ %r0 = load <2 x double>, ptr addrspace(1) %in, align 16
+ %r1 = frem <2 x double> <double 0.0, double 0.0>, %r0
+ store <2 x double> %r1, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+define amdgpu_kernel void @frem_v2f64_const_one_denum(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: frem_v2f64_const_one_denum:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT: s_cbranch_vccz .LBB15_2
+; SI-NEXT: ; %bb.1: ; %frem.else16
+; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; SI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; SI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; SI-NEXT: s_mov_b64 vcc, exec
+; SI-NEXT: s_cbranch_execz .LBB15_3
+; SI-NEXT: s_branch .LBB15_8
+; SI-NEXT: .LBB15_2:
+; SI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: .LBB15_3: ; %frem.compute15
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, 0x7ff00000
+; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; SI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; SI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: v_readfirstlane_b32 s2, v6
+; SI-NEXT: s_cselect_b32 s3, s2, 0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_add_i32 s5, s3, -1
+; SI-NEXT: v_ldexp_f64 v[5:6], v[4:5], 26
+; SI-NEXT: s_cmp_lt_i32 s5, 27
+; SI-NEXT: s_cbranch_scc1 .LBB15_7
+; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; SI-NEXT: s_add_i32 s5, s3, 25
+; SI-NEXT: v_mov_b32_e32 v9, 0x43300000
+; SI-NEXT: v_mov_b32_e32 v4, 0
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: .LBB15_5: ; %frem.loop_body23
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_mov_b32_e32 v8, v6
+; SI-NEXT: v_mov_b32_e32 v7, v5
+; SI-NEXT: v_bfi_b32 v5, s4, v9, v8
+; SI-NEXT: v_add_f64 v[10:11], v[7:8], v[4:5]
+; SI-NEXT: v_add_f64 v[5:6], v[10:11], -v[4:5]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[7:8]|, s[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT: v_add_f64 v[5:6], v[7:8], -v[5:6]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[5:6]
+; SI-NEXT: v_add_f64 v[10:11], v[5:6], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; SI-NEXT: v_ldexp_f64 v[5:6], v[5:6], 26
+; SI-NEXT: s_sub_i32 s5, s5, 26
+; SI-NEXT: s_cmp_gt_i32 s5, 26
+; SI-NEXT: s_cbranch_scc1 .LBB15_5
+; SI-NEXT: ; %bb.6: ; %Flow50
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: .LBB15_7: ; %frem.loop_exit24
+; SI-NEXT: s_sub_i32 s2, s5, 25
+; SI-NEXT: v_ldexp_f64 v[4:5], v[5:6], s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[4:5]|, s[2:3]
+; SI-NEXT: s_brev_b32 s2, -2
+; SI-NEXT: v_mov_b32_e32 v6, 0x43300000
+; SI-NEXT: v_bfi_b32 v7, s2, v6, v5
+; SI-NEXT: v_mov_b32_e32 v6, 0
+; SI-NEXT: v_add_f64 v[8:9], v[4:5], v[6:7]
+; SI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc
+; SI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; SI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT: v_bfi_b32 v5, s2, v5, v1
+; SI-NEXT: .LBB15_8:
+; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0
+; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT: s_cbranch_vccz .LBB15_10
+; SI-NEXT: ; %bb.9: ; %frem.else
+; SI-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
+; SI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; SI-NEXT: s_mov_b64 vcc, exec
+; SI-NEXT: s_cbranch_execz .LBB15_11
+; SI-NEXT: s_branch .LBB15_16
+; SI-NEXT: .LBB15_10:
+; SI-NEXT: ; implicit-def: $vgpr6_vgpr7
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: .LBB15_11: ; %frem.compute
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_and_b32_e32 v8, 0x7fffffff, v3
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, 0x7ff00000
+; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[2:3]
+; SI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc
+; SI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; SI-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-NEXT: v_readfirstlane_b32 s2, v8
+; SI-NEXT: s_cselect_b32 s3, s2, 0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_add_i32 s5, s3, -1
+; SI-NEXT: v_ldexp_f64 v[7:8], v[6:7], 26
+; SI-NEXT: s_cmp_lt_i32 s5, 27
+; SI-NEXT: s_cbranch_scc1 .LBB15_15
+; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; SI-NEXT: s_add_i32 s5, s3, 25
+; SI-NEXT: v_mov_b32_e32 v11, 0x43300000
+; SI-NEXT: v_mov_b32_e32 v6, 0
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: .LBB15_13: ; %frem.loop_body
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_mov_b32_e32 v10, v8
+; SI-NEXT: v_mov_b32_e32 v9, v7
+; SI-NEXT: v_bfi_b32 v7, s4, v11, v10
+; SI-NEXT: v_add_f64 v[12:13], v[9:10], v[6:7]
+; SI-NEXT: v_add_f64 v[7:8], v[12:13], -v[6:7]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[9:10]|, s[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT: v_add_f64 v[7:8], v[9:10], -v[7:8]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[7:8]
+; SI-NEXT: v_add_f64 v[12:13], v[7:8], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; SI-NEXT: v_ldexp_f64 v[7:8], v[7:8], 26
+; SI-NEXT: s_sub_i32 s5, s5, 26
+; SI-NEXT: s_cmp_gt_i32 s5, 26
+; SI-NEXT: s_cbranch_scc1 .LBB15_13
+; SI-NEXT: ; %bb.14: ; %Flow
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: v_mov_b32_e32 v8, v10
+; SI-NEXT: .LBB15_15: ; %frem.loop_exit
+; SI-NEXT: s_sub_i32 s2, s5, 25
+; SI-NEXT: v_ldexp_f64 v[6:7], v[7:8], s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0x432fffff
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[2:3]
+; SI-NEXT: s_brev_b32 s2, -2
+; SI-NEXT: v_mov_b32_e32 v8, 0x43300000
+; SI-NEXT: v_bfi_b32 v9, s2, v8, v7
+; SI-NEXT: v_mov_b32_e32 v8, 0
+; SI-NEXT: v_add_f64 v[10:11], v[6:7], v[8:9]
+; SI-NEXT: v_add_f64 v[8:9], v[10:11], -v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
+; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; SI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT: v_bfi_b32 v7, s2, v7, v3
+; SI-NEXT: .LBB15_16: ; %Flow49
+; SI-NEXT: s_mov_b32 s4, 0
+; SI-NEXT: s_mov_b32 s5, 0x7ff00000
+; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; CI-LABEL: frem_v2f64_const_one_denum:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s4, s2
+; CI-NEXT: s_mov_b32 s5, s3
+; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT: s_cbranch_vccz .LBB15_2
+; CI-NEXT: ; %bb.1: ; %frem.else16
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; CI-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; CI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; CI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; CI-NEXT: s_cbranch_execz .LBB15_3
+; CI-NEXT: s_branch .LBB15_8
+; CI-NEXT: .LBB15_2:
+; CI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CI-NEXT: .LBB15_3: ; %frem.compute15
+; CI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; CI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v6
+; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8
+; CI-NEXT: s_cbranch_vccnz .LBB15_7
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; CI-NEXT: v_add_i32_e32 v8, vcc, 25, v6
+; CI-NEXT: .LBB15_5: ; %frem.loop_body23
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; CI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_subrev_i32_e32 v8, vcc, 26, v8
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8
+; CI-NEXT: s_cbranch_vccnz .LBB15_5
+; CI-NEXT: ; %bb.6: ; %Flow50
+; CI-NEXT: v_mov_b32_e32 v4, v6
+; CI-NEXT: v_mov_b32_e32 v5, v7
+; CI-NEXT: .LBB15_7: ; %frem.loop_exit24
+; CI-NEXT: v_subrev_i32_e32 v6, vcc, 25, v8
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; CI-NEXT: s_brev_b32 s2, -2
+; CI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; CI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_bfi_b32 v5, s2, v5, v1
+; CI-NEXT: .LBB15_8:
+; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0
+; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT: s_cbranch_vccz .LBB15_10
+; CI-NEXT: ; %bb.9: ; %frem.else
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0
+; CI-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; CI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
+; CI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; CI-NEXT: s_cbranch_execz .LBB15_11
+; CI-NEXT: s_branch .LBB15_16
+; CI-NEXT: .LBB15_10:
+; CI-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CI-NEXT: .LBB15_11: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v8
+; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10
+; CI-NEXT: s_cbranch_vccnz .LBB15_15
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v10, vcc, 25, v8
+; CI-NEXT: .LBB15_13: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v9, v7
+; CI-NEXT: v_mov_b32_e32 v8, v6
+; CI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; CI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; CI-NEXT: v_subrev_i32_e32 v10, vcc, 26, v10
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10
+; CI-NEXT: s_cbranch_vccnz .LBB15_13
+; CI-NEXT: ; %bb.14: ; %Flow
+; CI-NEXT: v_mov_b32_e32 v6, v8
+; CI-NEXT: v_mov_b32_e32 v7, v9
+; CI-NEXT: .LBB15_15: ; %frem.loop_exit
+; CI-NEXT: v_subrev_i32_e32 v8, vcc, 25, v10
+; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; CI-NEXT: s_brev_b32 s2, -2
+; CI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; CI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT: v_bfi_b32 v7, s2, v7, v3
+; CI-NEXT: .LBB15_16: ; %Flow49
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: s_mov_b32 s5, 0x7ff00000
+; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5]
+; CI-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5]
+; CI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
+; CI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: frem_v2f64_const_one_denum:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT: s_cbranch_vccz .LBB15_2
+; VI-NEXT: ; %bb.1: ; %frem.else16
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; VI-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; VI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; VI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; VI-NEXT: s_cbranch_execz .LBB15_3
+; VI-NEXT: s_branch .LBB15_8
+; VI-NEXT: .LBB15_2:
+; VI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; VI-NEXT: .LBB15_3: ; %frem.compute15
+; VI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; VI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v6
+; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8
+; VI-NEXT: s_cbranch_vccnz .LBB15_7
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; VI-NEXT: v_add_u32_e32 v8, vcc, 25, v6
+; VI-NEXT: .LBB15_5: ; %frem.loop_body23
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; VI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_subrev_u32_e32 v8, vcc, 26, v8
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8
+; VI-NEXT: s_cbranch_vccnz .LBB15_5
+; VI-NEXT: ; %bb.6: ; %Flow50
+; VI-NEXT: v_mov_b32_e32 v4, v6
+; VI-NEXT: v_mov_b32_e32 v5, v7
+; VI-NEXT: .LBB15_7: ; %frem.loop_exit24
+; VI-NEXT: v_subrev_u32_e32 v6, vcc, 25, v8
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; VI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_bfi_b32 v5, s2, v5, v1
+; VI-NEXT: .LBB15_8:
+; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0
+; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT: s_cbranch_vccz .LBB15_10
+; VI-NEXT: ; %bb.9: ; %frem.else
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0
+; VI-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
+; VI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; VI-NEXT: s_cbranch_execz .LBB15_11
+; VI-NEXT: s_branch .LBB15_16
+; VI-NEXT: .LBB15_10:
+; VI-NEXT: ; implicit-def: $vgpr6_vgpr7
+; VI-NEXT: .LBB15_11: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v8
+; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10
+; VI-NEXT: s_cbranch_vccnz .LBB15_15
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v10, vcc, 25, v8
+; VI-NEXT: .LBB15_13: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; VI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; VI-NEXT: v_subrev_u32_e32 v10, vcc, 26, v10
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10
+; VI-NEXT: s_cbranch_vccnz .LBB15_13
+; VI-NEXT: ; %bb.14: ; %Flow
+; VI-NEXT: v_mov_b32_e32 v6, v8
+; VI-NEXT: v_mov_b32_e32 v7, v9
+; VI-NEXT: .LBB15_15: ; %frem.loop_exit
+; VI-NEXT: v_subrev_u32_e32 v8, vcc, 25, v10
+; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; VI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT: v_bfi_b32 v7, s2, v7, v3
+; VI-NEXT: .LBB15_16: ; %Flow49
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_mov_b32 s3, 0x7ff00000
+; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; VI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: frem_v2f64_const_one_denum:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0
+; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_vccz .LBB15_2
+; GFX9-NEXT: ; %bb.1: ; %frem.else16
+; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0
+; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
+; GFX9-NEXT: s_cbranch_execz .LBB15_3
+; GFX9-NEXT: s_branch .LBB15_8
+; GFX9-NEXT: .LBB15_2:
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT: .LBB15_3: ; %frem.compute15
+; GFX9-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX9-NEXT: v_add_u32_e32 v8, -1, v6
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX9-NEXT: v_add_u32_e32 v8, 25, v6
+; GFX9-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX9-NEXT: v_subrev_u32_e32 v8, 26, v8
+; GFX9-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: v_add_f64 v[9:10], v[4:5], 1.0
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_5
+; GFX9-NEXT: ; %bb.6: ; %Flow50
+; GFX9-NEXT: v_mov_b32_e32 v4, v6
+; GFX9-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX9-NEXT: v_subrev_u32_e32 v6, 25, v8
+; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX9-NEXT: s_brev_b32 s2, -2
+; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v1
+; GFX9-NEXT: .LBB15_8:
+; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0
+; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_vccz .LBB15_10
+; GFX9-NEXT: ; %bb.9: ; %frem.else
+; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0
+; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
+; GFX9-NEXT: s_cbranch_execz .LBB15_11
+; GFX9-NEXT: s_branch .LBB15_16
+; GFX9-NEXT: .LBB15_10:
+; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX9-NEXT: .LBB15_11: ; %frem.compute
+; GFX9-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX9-NEXT: v_add_u32_e32 v10, -1, v8
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX9-NEXT: v_add_u32_e32 v10, 25, v8
+; GFX9-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX9-NEXT: v_subrev_u32_e32 v10, 26, v10
+; GFX9-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_add_f64 v[11:12], v[6:7], 1.0
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc
+; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_13
+; GFX9-NEXT: ; %bb.14: ; %Flow
+; GFX9-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-NEXT: v_mov_b32_e32 v7, v9
+; GFX9-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX9-NEXT: v_subrev_u32_e32 v8, 25, v10
+; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX9-NEXT: s_brev_b32 s2, -2
+; GFX9-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX9-NEXT: v_bfi_b32 v7, s2, v7, v3
+; GFX9-NEXT: .LBB15_16: ; %Flow49
+; GFX9-NEXT: s_mov_b32 s2, 0
+; GFX9-NEXT: s_mov_b32 s3, 0x7ff00000
+; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: frem_v2f64_const_one_denum:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT: s_cbranch_vccz .LBB15_2
+; GFX10-NEXT: ; %bb.1: ; %frem.else16
+; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0
+; GFX10-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB15_3
+; GFX10-NEXT: s_branch .LBB15_8
+; GFX10-NEXT: .LBB15_2:
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT: .LBB15_3: ; %frem.compute15
+; GFX10-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX10-NEXT: v_add_nc_u32_e32 v8, -1, v6
+; GFX10-NEXT: v_readfirstlane_b32 s2, v6
+; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8
+; GFX10-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX10-NEXT: s_add_i32 s2, s2, 25
+; GFX10-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: s_sub_i32 s2, s2, 26
+; GFX10-NEXT: s_cmp_gt_i32 s2, 26
+; GFX10-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT: v_add_f64 v[8:9], v[4:5], 1.0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX10-NEXT: s_cbranch_scc1 .LBB15_5
+; GFX10-NEXT: ; %bb.6: ; %Flow50
+; GFX10-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v8, s2
+; GFX10-NEXT: v_mov_b32_e32 v5, v7
+; GFX10-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 25, v8
+; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1
+; GFX10-NEXT: .LBB15_8:
+; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT: s_cbranch_vccz .LBB15_10
+; GFX10-NEXT: ; %bb.9: ; %frem.else
+; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0
+; GFX10-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB15_11
+; GFX10-NEXT: s_branch .LBB15_16
+; GFX10-NEXT: .LBB15_10:
+; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX10-NEXT: .LBB15_11: ; %frem.compute
+; GFX10-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v8
+; GFX10-NEXT: v_readfirstlane_b32 s2, v8
+; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX10-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX10-NEXT: s_add_i32 s2, s2, 25
+; GFX10-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_mov_b32_e32 v9, v7
+; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: s_sub_i32 s2, s2, 26
+; GFX10-NEXT: s_cmp_gt_i32 s2, 26
+; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX10-NEXT: v_add_f64 v[10:11], v[6:7], 1.0
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX10-NEXT: s_cbranch_scc1 .LBB15_13
+; GFX10-NEXT: ; %bb.14: ; %Flow
+; GFX10-NEXT: v_mov_b32_e32 v6, v8
+; GFX10-NEXT: v_mov_b32_e32 v10, s2
+; GFX10-NEXT: v_mov_b32_e32 v7, v9
+; GFX10-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 25, v10
+; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX10-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX10-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX10-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX10-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3
+; GFX10-NEXT: .LBB15_16: ; %Flow49
+; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]|
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: frem_v2f64_const_one_denum:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0
+; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT: s_cbranch_vccz .LBB15_2
+; GFX11-NEXT: ; %bb.1: ; %frem.else16
+; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0
+; GFX11-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX11-NEXT: s_cbranch_execz .LBB15_3
+; GFX11-NEXT: s_branch .LBB15_8
+; GFX11-NEXT: .LBB15_2:
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: .LBB15_3: ; %frem.compute15
+; GFX11-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX11-NEXT: v_add_nc_u32_e32 v8, -1, v6
+; GFX11-NEXT: v_readfirstlane_b32 s2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8
+; GFX11-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX11-NEXT: s_add_i32 s2, s2, 25
+; GFX11-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: s_sub_i32 s2, s2, 26
+; GFX11-NEXT: s_cmp_gt_i32 s2, 26
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX11-NEXT: v_add_f64 v[8:9], v[4:5], 1.0
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX11-NEXT: s_cbranch_scc1 .LBB15_5
+; GFX11-NEXT: ; %bb.6: ; %Flow50
+; GFX11-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7
+; GFX11-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 25, v8
+; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX11-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1
+; GFX11-NEXT: .LBB15_8:
+; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0
+; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT: s_cbranch_vccz .LBB15_10
+; GFX11-NEXT: ; %bb.9: ; %frem.else
+; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0
+; GFX11-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo
+; GFX11-NEXT: s_cbranch_execz .LBB15_11
+; GFX11-NEXT: s_branch .LBB15_16
+; GFX11-NEXT: .LBB15_10:
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT: .LBB15_11: ; %frem.compute
+; GFX11-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v8
+; GFX11-NEXT: v_readfirstlane_b32 s2, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX11-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX11-NEXT: s_add_i32 s2, s2, 25
+; GFX11-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX11-NEXT: s_sub_i32 s2, s2, 26
+; GFX11-NEXT: s_cmp_gt_i32 s2, 26
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT: v_add_f64 v[10:11], v[6:7], 1.0
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX11-NEXT: s_cbranch_scc1 .LBB15_13
+; GFX11-NEXT: ; %bb.14: ; %Flow
+; GFX11-NEXT: v_mov_b32_e32 v6, v8
+; GFX11-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9
+; GFX11-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 25, v10
+; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3
+; GFX11-NEXT: .LBB15_16: ; %Flow49
+; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]|
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX1150-LABEL: frem_v2f64_const_one_denum:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX1150-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0
+; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX1150-NEXT: s_cbranch_vccz .LBB15_2
+; GFX1150-NEXT: ; %bb.1: ; %frem.else16
+; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0
+; GFX1150-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo
+; GFX1150-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX1150-NEXT: s_cbranch_execz .LBB15_3
+; GFX1150-NEXT: s_branch .LBB15_8
+; GFX1150-NEXT: .LBB15_2:
+; GFX1150-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1150-NEXT: .LBB15_3: ; %frem.compute15
+; GFX1150-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX1150-NEXT: v_add_nc_u32_e32 v8, -1, v6
+; GFX1150-NEXT: v_readfirstlane_b32 s2, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8
+; GFX1150-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX1150-NEXT: s_add_i32 s2, s2, 25
+; GFX1150-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX1150-NEXT: s_sub_i32 s2, s2, 26
+; GFX1150-NEXT: s_cmp_gt_i32 s2, 26
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX1150-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX1150-NEXT: v_add_f64 v[8:9], v[4:5], 1.0
+; GFX1150-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX1150-NEXT: s_cbranch_scc1 .LBB15_5
+; GFX1150-NEXT: ; %bb.6: ; %Flow50
+; GFX1150-NEXT: v_mov_b32_e32 v4, v6
+; GFX1150-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7
+; GFX1150-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_subrev_nc_u32_e32 v6, 25, v8
+; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX1150-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX1150-NEXT: v_add_f64 v[6:7], v[4:5], 1.0
+; GFX1150-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1
+; GFX1150-NEXT: .LBB15_8:
+; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0
+; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX1150-NEXT: s_cbranch_vccz .LBB15_10
+; GFX1150-NEXT: ; %bb.9: ; %frem.else
+; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0
+; GFX1150-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo
+; GFX1150-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo
+; GFX1150-NEXT: s_cbranch_execz .LBB15_11
+; GFX1150-NEXT: s_branch .LBB15_16
+; GFX1150-NEXT: .LBB15_10:
+; GFX1150-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1150-NEXT: .LBB15_11: ; %frem.compute
+; GFX1150-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX1150-NEXT: v_add_nc_u32_e32 v10, -1, v8
+; GFX1150-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX1150-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX1150-NEXT: s_add_i32 s2, s2, 25
+; GFX1150-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX1150-NEXT: s_sub_i32 s2, s2, 26
+; GFX1150-NEXT: s_cmp_gt_i32 s2, 26
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX1150-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1150-NEXT: v_add_f64 v[10:11], v[6:7], 1.0
+; GFX1150-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX1150-NEXT: s_cbranch_scc1 .LBB15_13
+; GFX1150-NEXT: ; %bb.14: ; %Flow
+; GFX1150-NEXT: v_mov_b32_e32 v6, v8
+; GFX1150-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9
+; GFX1150-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_subrev_nc_u32_e32 v8, 25, v10
+; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX1150-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1150-NEXT: v_add_f64 v[8:9], v[6:7], 1.0
+; GFX1150-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3
+; GFX1150-NEXT: .LBB15_16: ; %Flow49
+; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX1150-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]|
+; GFX1150-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7
+; GFX1150-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo
+; GFX1150-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64_const_one_denum:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0
+; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX1200-NEXT: s_cbranch_vccz .LBB15_2
+; GFX1200-NEXT: ; %bb.1: ; %frem.else16
+; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0
+; GFX1200-NEXT: v_and_b32_e32 v4, 0x80000000, v1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo
+; GFX1200-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX1200-NEXT: s_cbranch_execz .LBB15_3
+; GFX1200-NEXT: s_branch .LBB15_8
+; GFX1200-NEXT: .LBB15_2:
+; GFX1200-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1200-NEXT: .LBB15_3: ; %frem.compute15
+; GFX1200-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX1200-NEXT: v_add_nc_u32_e32 v8, -1, v6
+; GFX1200-NEXT: v_readfirstlane_b32 s2, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8
+; GFX1200-NEXT: s_cbranch_vccnz .LBB15_7
+; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
+; GFX1200-NEXT: s_add_co_i32 s2, s2, 25
+; GFX1200-NEXT: .LBB15_5: ; %frem.loop_body23
+; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26
+; GFX1200-NEXT: s_cmp_gt_i32 s2, 26
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_rndne_f64_e32 v[4:5], v[6:7]
+; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[4:5]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX1200-NEXT: s_cbranch_scc1 .LBB15_5
+; GFX1200-NEXT: ; %bb.6: ; %Flow50
+; GFX1200-NEXT: v_mov_b32_e32 v4, v6
+; GFX1200-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7
+; GFX1200-NEXT: .LBB15_7: ; %frem.loop_exit24
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_subrev_nc_u32_e32 v6, 25, v8
+; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[4:5]
+; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[6:7]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5]
+; GFX1200-NEXT: v_add_f64_e32 v[6:7], 1.0, v[4:5]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1
+; GFX1200-NEXT: .LBB15_8:
+; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0
+; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_cbranch_vccz .LBB15_10
+; GFX1200-NEXT: ; %bb.9: ; %frem.else
+; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0
+; GFX1200-NEXT: v_and_b32_e32 v6, 0x80000000, v3
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo
+; GFX1200-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo
+; GFX1200-NEXT: s_cbranch_execz .LBB15_11
+; GFX1200-NEXT: s_branch .LBB15_16
+; GFX1200-NEXT: .LBB15_10:
+; GFX1200-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1200-NEXT: .LBB15_11: ; %frem.compute
+; GFX1200-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX1200-NEXT: v_add_nc_u32_e32 v10, -1, v8
+; GFX1200-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX1200-NEXT: s_cbranch_vccnz .LBB15_15
+; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader
+; GFX1200-NEXT: s_add_co_i32 s2, s2, 25
+; GFX1200-NEXT: .LBB15_13: ; %frem.loop_body
+; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_cmp_gt_i32 s2, 26
+; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[8:9]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[6:7]
+; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1200-NEXT: v_add_f64_e32 v[10:11], 1.0, v[6:7]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10
+; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX1200-NEXT: s_cbranch_scc1 .LBB15_13
+; GFX1200-NEXT: ; %bb.14: ; %Flow
+; GFX1200-NEXT: v_mov_b32_e32 v6, v8
+; GFX1200-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9
+; GFX1200-NEXT: .LBB15_15: ; %frem.loop_exit
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_subrev_nc_u32_e32 v8, 25, v10
+; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_rndne_f64_e32 v[8:9], v[6:7]
+; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[8:9]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[6:7]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3
+; GFX1200-NEXT: .LBB15_16: ; %Flow49
+; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]|
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7
+; GFX1200-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo
+; GFX1200-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
+ %r0 = load <2 x double>, ptr addrspace(1) %in, align 16
+ %r1 = frem <2 x double> %r0, <double 1.0, double 1.0>
+ store <2 x double> %r1, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 {
+; SI-LABEL: frem_v2f64_const:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: v_mov_b32_e32 v3, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; CI-LABEL: frem_v2f64_const:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; CI-NEXT: v_mov_b32_e32 v2, v0
+; CI-NEXT: v_mov_b32_e32 v3, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: frem_v2f64_const:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: v_mov_b32_e32 v3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: frem_v2f64_const:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: frem_v2f64_const:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: frem_v2f64_const:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX1150-LABEL: frem_v2f64_const:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1150-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_mov_b32_e32 v2, v0
+; GFX1150-NEXT: v_mov_b32_e32 v3, v0
+; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64_const:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1200-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
+ %r0 = frem <2 x double> <double 1.0, double 1.0>, <double 2.0, double 1.0>
+ store <2 x double> %r0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+
+
attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+
+
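For reference, the frem_v2f64_const test has two constant operands, so
the frem folds away completely and every target just stores the folded
vector. Since frem follows fmod semantics, the expected lane values can
be checked with a minimal standalone program (illustration only, not
part of the patch):

  #include <cassert>
  #include <cmath>

  int main() {
    // frem <2 x double> <1.0, 1.0>, <2.0, 1.0> folds lane-wise to
    // <1.0, 0.0>. This matches the generated kernels above, which
    // store v1 = 0x3ff00000 (the high dword of the IEEE-754 encoding
    // of 1.0) and zeros in the remaining dwords.
    assert(std::fmod(1.0, 2.0) == 1.0);
    assert(std::fmod(1.0, 1.0) == 0.0);
    return 0;
  }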
>From 221d554ba5fb7a3782e7babe596ed35af110b54d Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Oct 2025 04:14:09 -0400
Subject: [PATCH 3/4] expand-fp: Revert "scalarize" changes
The "dyn_cast" needs to be there as witnessed by the test case in
CodeGen/AMDGPU/frem.ll with two constant vector operands.
Refactor the loop that visits the instructions to allow for a single
assignment to the "Modified" variable.
---
llvm/lib/CodeGen/ExpandFp.cpp | 48 +++++++++++++++++------------------
1 file changed, 24 insertions(+), 24 deletions(-)
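A minimal sketch of the failure mode that motivates the dyn_cast,
assuming the default IRBuilder constant folder (illustration only,
not part of the patch; the helper name is made up):

  #include <cassert>
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/LLVMContext.h"

  using namespace llvm;

  void foldedScalarizedOp() {
    LLVMContext Ctx;
    IRBuilder<> Builder(Ctx);
    Type *DoubleTy = Builder.getDoubleTy();

    // With two constant operands the builder folds the frem instead
    // of emitting an instruction, so NewOp is a ConstantFP here ...
    Value *NewOp = Builder.CreateFRem(ConstantFP::get(DoubleTy, 1.0),
                                      ConstantFP::get(DoubleTy, 2.0));

    // ... and an unconditional cast<Instruction>(NewOp) would assert.
    assert(isa<Constant>(NewOp) && !isa<Instruction>(NewOp));
  }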
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 4c10f701ce08b..832d105480e0e 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -941,9 +941,10 @@ static void scalarize(Instruction *I,
llvm_unreachable("Unsupported instruction type");
Result = Builder.CreateInsertElement(Result, NewOp, Idx);
- Instruction *ScalarizedI = cast<Instruction>(NewOp);
- ScalarizedI->copyIRFlags(I, true);
- Worklist.push_back(ScalarizedI);
+ if (auto *ScalarizedI = dyn_cast<Instruction>(NewOp)) {
+ ScalarizedI->copyIRFlags(I, true);
+ Worklist.push_back(ScalarizedI);
+ }
}
I->replaceAllUsesWith(Result);
@@ -1001,45 +1002,44 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
if (MaxLegalFpConvertBitWidth >= llvm::IntegerType::MAX_INT_BITS)
return false;
- for (auto It = inst_begin(&F), End = inst_end(F); It != End;) {
- Instruction &I = *It++;
+ auto ShouldHandleInst = [&](Instruction &I) {
Type *Ty = I.getType();
// TODO: This pass doesn't handle scalable vectors.
if (Ty->isScalableTy())
- continue;
+ return false;
switch (I.getOpcode()) {
case Instruction::FRem:
- if (!targetSupportsFrem(TLI, Ty) &&
- FRemExpander::canExpandType(Ty->getScalarType())) {
- addToWorklist(I, Worklist);
- }
- break;
+ return !targetSupportsFrem(TLI, Ty) &&
+ FRemExpander::canExpandType(Ty->getScalarType());
+
case Instruction::FPToUI:
case Instruction::FPToSI: {
auto *IntTy = cast<IntegerType>(Ty->getScalarType());
- if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
- continue;
-
- addToWorklist(I, Worklist);
- break;
+ return IntTy->getIntegerBitWidth() > MaxLegalFpConvertBitWidth;
}
+
case Instruction::UIToFP:
case Instruction::SIToFP: {
auto *IntTy =
cast<IntegerType>(I.getOperand(0)->getType()->getScalarType());
- if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
- continue;
-
- addToWorklist(I, Worklist);
- break;
+ return IntTy->getIntegerBitWidth() > MaxLegalFpConvertBitWidth;
}
- default:
- break;
}
+
+ return false;
+ };
+
+ bool Modified = false;
+ for (auto It = inst_begin(&F), End = inst_end(F); It != End;) {
+ Instruction &I = *It++;
+ if (!ShouldHandleInst(I))
+ continue;
+
+ addToWorklist(I, Worklist);
+ Modified = true;
}
- bool Modified = !Worklist.empty();
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
if (I->getOpcode() == Instruction::FRem) {
>From f6407d4637d7d53cb96a39374c9458046a8b04a8 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Oct 2025 04:35:55 -0400
Subject: [PATCH 4/4] expand-fp: Replace if-else by switch-case in Worklist
processing
This mirrors the structure of the instruction visiting function
and is more readable.
---
llvm/lib/CodeGen/ExpandFp.cpp | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 832d105480e0e..2b5ced3915a2c 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -1042,7 +1042,9 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
- if (I->getOpcode() == Instruction::FRem) {
+
+ switch (I->getOpcode()) {
+ case Instruction::FRem: {
auto SQ = [&]() -> std::optional<SimplifyQuery> {
if (AC) {
auto Res = std::make_optional<SimplifyQuery>(
@@ -1054,11 +1056,18 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
}();
expandFRem(cast<BinaryOperator>(*I), SQ);
- } else if (I->getOpcode() == Instruction::FPToUI ||
- I->getOpcode() == Instruction::FPToSI) {
+ break;
+ }
+
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
expandFPToI(I);
- } else {
+ break;
+
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
expandIToFP(I);
+ break;
}
}