[llvm] [GlobalISel][AArch64][AMDGPU] Expand FPOWI into series of multiplication (PR #95217)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 23 06:21:06 PDT 2024
https://github.com/isuckatcs updated https://github.com/llvm/llvm-project/pull/95217
>From 49c291a459d7aa868652c587dd7dc80ee70b4e1e Mon Sep 17 00:00:00 2001
From: isuckatcs <65320245+isuckatcs at users.noreply.github.com>
Date: Thu, 13 Jun 2024 11:36:26 +0200
Subject: [PATCH] [GlobalISel][AArch64][AMDGPU] Expand FPOWI into series of
multiplication
SelectionDAG already converts FPOWI into multiplications; this patch
introduces the same optimization into GlobalISel.
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 7 +
.../include/llvm/Target/GlobalISel/Combine.td | 9 +-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 53 ++
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 2 -
.../GlobalISel/combine-fpowi-optsize.ll | 26 +
.../AArch64/GlobalISel/combine-fpowi.mir | 124 +++++
.../CodeGen/AMDGPU/GlobalISel/llvm.powi.ll | 497 +++++++-----------
7 files changed, 418 insertions(+), 300 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi-optsize.ll
create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi.mir
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 43659564d5ace..37a56e12efcc3 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -865,6 +865,13 @@ class CombinerHelper {
/// By default, it erases the instruction def'd on \p MO from the function.
void applyBuildFnMO(const MachineOperand &MO, BuildFnTy &MatchInfo);
+ /// Match FPOWI if it's safe to expand it into a series of multiplications.
+ bool matchFPowIExpansion(MachineInstr &MI, int64_t Exponent);
+
+ /// Expands FPOWI into a series of multiplications and a division if the
+ /// exponent is negative.
+ void applyExpandFPowI(MachineInstr &MI, int64_t Exponent);
+
/// Combine insert vector element OOB.
bool matchInsertVectorElementOOB(MachineInstr &MI, BuildFnTy &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index bd43b95899030..b0789fca630e8 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1636,6 +1636,13 @@ def sub_of_vscale : GICombineRule<
[{ return Helper.matchSubOfVScale(${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+def expand_const_fpowi : GICombineRule<
+ (defs root:$root),
+ (match (G_CONSTANT $int, $imm),
+ (G_FPOWI $dst, $float, $int):$root,
+ [{ return Helper.matchFPowIExpansion(*${root}, ${imm}.getCImm()->getSExtValue()); }]),
+ (apply [{ Helper.applyExpandFPowI(*${root}, ${imm}.getCImm()->getSExtValue()); }])>;
+
// match_extract_of_element and insert_vector_elt_oob must be the first!
def vector_ops_combines: GICombineGroup<[
match_extract_of_element_undef_vector,
@@ -1786,7 +1793,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,
overlapping_and, mulo_by_2, mulo_by_0,
adde_to_addo,
- combine_minmax_nan]>;
+ combine_minmax_nan, expand_const_fpowi]>;
def known_bits_simplifications : GICombineGroup<[
redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 02d85958fc7be..3743ef69d6d44 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7356,6 +7356,59 @@ void CombinerHelper::applyBuildFnMO(const MachineOperand &MO,
Root->eraseFromParent();
}
+// Returns true if expanding G_FPOWI by constant \p Exponent is worthwhile.
+// FIXME: Ported from TargetLoweringBase::isBeneficialToExpandPowI; unify them.
+bool CombinerHelper::matchFPowIExpansion(MachineInstr &MI, int64_t Exponent) {
+  if (!MI.getMF()->getFunction().hasOptSize())
+    return true; // Only optimizing for size restricts the expansion.
+  // powi(x, 0) folds to 1.0, so accept it before Log2_64(0) == -1 rejects it.
+  uint64_t E = Exponent < 0 ? 0 - uint64_t(Exponent) : uint64_t(Exponent);
+  return E == 0 || llvm::popcount(E) + Log2_64(E) < 7;
+}
+
+// Rewrites G_FPOWI with constant \p Exponent as G_FMULs (plus G_FDIV if < 0).
+void CombinerHelper::applyExpandFPowI(MachineInstr &MI, int64_t Exponent) {
+  auto [Dst, Base] = MI.getFirst2Regs();
+  LLT Ty = MRI.getType(Dst);
+
+  // powi(x, 0) is 1.0: define Dst directly and delete (not just unlink) MI.
+  if (Exponent == 0) {
+    Builder.buildFConstant(Dst, 1.0);
+    MI.eraseFromParent();
+    return;
+  }
+
+  // Take the magnitude in unsigned arithmetic so INT64_MIN cannot overflow.
+  uint64_t ExpVal = Exponent < 0 ? 0 - uint64_t(Exponent) : uint64_t(Exponent);
+
+  // We use the simple binary decomposition method from SelectionDAG ExpandPowI
+  // to generate the multiply sequence. There are more optimal ways to do this
+  // (for example, powi(x,15) generates one more multiply than it should), but
+  // this has the benefit of being both really simple and much better than a
+  // libcall.
+  std::optional<SrcOp> Res;
+  SrcOp CurSquare = Base;
+  while (ExpVal > 0) {
+    if (ExpVal & 1) {
+      if (!Res)
+        Res = CurSquare;
+      else
+        Res = Builder.buildFMul(Ty, *Res, CurSquare);
+    }
+
+    CurSquare = Builder.buildFMul(Ty, CurSquare, CurSquare);
+    ExpVal >>= 1;
+  }
+
+  // A negative exponent inverts the result: powi(x, -n) == 1.0 / powi(x, n).
+  if (Exponent < 0)
+    Res = Builder.buildFDiv(Ty, Builder.buildFConstant(Ty, 1.0), *Res,
+                            MI.getFlags());
+
+  Builder.buildCopy(Dst, *Res);
+  MI.eraseFromParent();
+}
+
bool CombinerHelper::matchSextOfTrunc(const MachineOperand &MO,
BuildFnTy &MatchInfo) {
GSext *Sext = cast<GSext>(getDefIgnoringCopies(MO.getReg(), MRI));
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 9830b521797c1..9c9409b1aff17 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7142,8 +7142,6 @@ LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
return UnableToLegalize;
}
-// TODO: If RHS is a constant SelectionDAGBuilder expands this into a
-// multiplication tree.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
auto [Dst, Src0, Src1] = MI.getFirst3Regs();
LLT Ty = MRI.getType(Dst);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi-optsize.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi-optsize.ll
new file mode 100644
index 0000000000000..1add4a86aee27
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi-optsize.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel | FileCheck %s
+
+define double @pow_optsize(double %x) nounwind optsize {
+; CHECK-LABEL: pow_optsize:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w0, #15 // =0xf
+; CHECK-NEXT: b __powidf2
+entry:
+ %0 = call double @llvm.powi.f64.i32(double %x, i32 15)
+ ret double %0
+}
+
+define double @pow_optsize_expand(double %x) nounwind optsize {
+; CHECK-LABEL: pow_optsize_expand:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmul d0, d0, d0
+; CHECK-NEXT: fmul d0, d0, d0
+; CHECK-NEXT: fmul d0, d0, d0
+; CHECK-NEXT: fmul d0, d0, d0
+; CHECK-NEXT: ret
+entry:
+ %0 = call double @llvm.powi.f64.i32(double %x, i32 16)
+ ret double %0
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi.mir
new file mode 100644
index 0000000000000..8b8158348e399
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi.mir
@@ -0,0 +1,124 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s
+
+---
+name: fpowi_s64_zero
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s64_zero
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; CHECK-NEXT: $d0 = COPY [[C]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_zero
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s32_zero
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: $s0 = COPY [[C]](s32)
+ %0:_(s32) = COPY $s0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32)
+ $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_positive
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_positive
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[FMUL]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[FMUL1]]
+ ; CHECK-NEXT: $d0 = COPY [[FMUL2]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 5
+ %3:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s64_negative
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s64_negative
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[FMUL]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[FMUL1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FDIV [[C]], [[FMUL2]]
+ ; CHECK-NEXT: $d0 = COPY [[FDIV]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 -5
+ %3:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_negative
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s32_negative
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[FMUL]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[FMUL1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FDIV [[C]], [[FMUL2]]
+ ; CHECK-NEXT: $s0 = COPY [[FDIV]](s32)
+ %0:_(s32) = COPY $s0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 -5
+ %3:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32)
+ $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_libcall
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_libcall
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[FPOWI:%[0-9]+]]:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI [[COPY]], [[COPY1]](s32)
+ ; CHECK-NEXT: $d0 = COPY [[FPOWI]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %1(s32)
+ $d0 = COPY %2(s64)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index b169063d67872..9d586e3e4a09a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -141,51 +141,57 @@ define float @v_powi_1_f32(float %l) {
}
define float @v_powi_neg1_f32(float %l) {
-; GFX78-LABEL: v_powi_neg1_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg1_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT: v_rcp_f32_e32 v2, v1
+; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg1_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v1
+; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_neg1_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
+; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -1.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 -1)
ret float %res
@@ -195,99 +201,74 @@ define float @v_powi_2_f32(float %l) {
; GFX78-LABEL: v_powi_2_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_2_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 2.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 2)
ret float %res
}
define float @v_powi_neg2_f32(float %l) {
-; GFX78-LABEL: v_powi_neg2_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg2_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT: v_rcp_f32_e32 v2, v1
+; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg2_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v1
+; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_neg2_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
+; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -2.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 -2)
ret float %res
@@ -297,48 +278,16 @@ define float @v_powi_4_f32(float %l) {
; GFX78-LABEL: v_powi_4_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 4.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_4_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 4.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 4)
ret float %res
@@ -348,48 +297,18 @@ define float @v_powi_8_f32(float %l) {
; GFX78-LABEL: v_powi_8_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41000000, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_8_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41000000, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 8)
ret float %res
@@ -399,48 +318,21 @@ define float @v_powi_16_f32(float %l) {
; GFX78-LABEL: v_powi_16_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41800000, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41800000, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 16)
ret float %res
@@ -450,99 +342,110 @@ define float @v_powi_128_f32(float %l) {
; GFX78-LABEL: v_powi_128_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x43000000, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_128_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x43000000, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 128)
ret float %res
}
define float @v_powi_neg128_f32(float %l) {
-; GFX78-LABEL: v_powi_neg128_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0xc3000000, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg128_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT: v_rcp_f32_e32 v2, v1
+; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg128_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v1
+; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_neg128_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0xc3000000, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 -128)
ret float %res
More information about the llvm-commits
mailing list