[llvm] [GlobalISel][AArch64][AMDGPU] Lower FPOWI into series of multiplication (PR #95217)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 12 05:05:08 PDT 2024
https://github.com/isuckatcs updated https://github.com/llvm/llvm-project/pull/95217
>From fefe3628fd57c9e1ecbc9e3e668a1c6dfe943c18 Mon Sep 17 00:00:00 2001
From: isuckatcs <65320245+isuckatcs at users.noreply.github.com>
Date: Wed, 12 Jun 2024 11:38:41 +0200
Subject: [PATCH] [GlobalISel][AArch64][AMDGPU] Lower FPOWI into a series of
multiplications
SelectionDAG already converts FPOWI with a constant exponent into a series of
multiplications; this patch introduces the same optimization into GlobalISel.
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 33 +-
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 17 +-
.../AArch64/GISel/AArch64LegalizerInfo.h | 2 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 22 +-
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 3 +
.../AArch64/GlobalISel/legalize-fpowi.mir | 145 ++
.../CodeGen/AMDGPU/GlobalISel/llvm.powi.ll | 1254 +++++++++++++----
7 files changed, 1169 insertions(+), 307 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 9830b521797c1..3549606ef9490 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7142,14 +7142,37 @@ LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
return UnableToLegalize;
}
-// TODO: If RHS is a constant SelectionDAGBuilder expands this into a
-// multiplication tree.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
- auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+ auto [Dst, Base, Exp] = MI.getFirst3Regs();
LLT Ty = MRI.getType(Dst);
- auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
- MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ std::optional<int64_t> ConstantExpValue = getIConstantVRegSExtVal(Exp, MRI);
+
+ if (!ConstantExpValue)
+ return UnableToLegalize;
+
+ int64_t OriginalExprVal = *ConstantExpValue;
+ int64_t ExpVal = OriginalExprVal;
+
+ if (ExpVal == 0) {
+ MIRBuilder.buildFConstant(Dst, 1.0);
+ MI.removeFromParent();
+ return Legalized;
+ }
+
+ if (ExpVal < 0)
+ ExpVal = -ExpVal;
+
+ auto Res = MIRBuilder.buildCopy(Ty, Base);
+ while (--ExpVal > 0)
+ Res = MIRBuilder.buildFMul(Ty, Res, Base);
+
+ // If the original was negative, invert the result, producing 1/(x*x*x).
+ if (OriginalExprVal < 0)
+ Res = MIRBuilder.buildFDiv(Ty, MIRBuilder.buildFConstant(Ty, 1.0), Res);
+
+ MIRBuilder.buildCopy(Dst, Res);
MI.eraseFromParent();
return Legalized;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 42cd43c3afa37..0a782c9fb0891 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -274,10 +274,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// Regardless of FP16 support, widen 16-bit elements to 32-bits.
.minScalar(0, s32)
.libcallFor({s32, s64});
- getActionDefinitionsBuilder(G_FPOWI)
- .scalarize(0)
- .minScalar(0, s32)
- .libcallFor({{s32, s32}, {s64, s32}});
+ getActionDefinitionsBuilder(G_FPOWI).scalarize(0).minScalar(0, s32).customFor(
+ {{s32, s32}, {s64, s32}});
getActionDefinitionsBuilder(G_INSERT)
.legalIf(all(typeInSet(0, {s32, s64, p0}),
@@ -1263,6 +1261,8 @@ bool AArch64LegalizerInfo::legalizeCustom(
case TargetOpcode::G_FSHL:
case TargetOpcode::G_FSHR:
return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
+ case TargetOpcode::G_FPOWI:
+ return legalizeFPowI(MI, LocObserver, Helper);
case TargetOpcode::G_ROTR:
return legalizeRotate(MI, MRI, Helper);
case TargetOpcode::G_CTPOP:
@@ -1344,6 +1344,15 @@ bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
return true;
}
+bool AArch64LegalizerInfo::legalizeFPowI(MachineInstr &MI,
+ LostDebugLocObserver &Observer,
+ LegalizerHelper &Helper) const {
+ if (Helper.lowerFPOWI(MI) == LegalizerHelper::Legalized)
+ return true;
+
+ return Helper.libcall(MI, Observer) == LegalizerHelper::Legalized;
+}
+
bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 00d85a36e4b2c..01f6541d1ac1d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -56,6 +56,8 @@ class AArch64LegalizerInfo : public LegalizerInfo {
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer,
LegalizerHelper &Helper) const;
+ bool legalizeFPowI(MachineInstr &MI, LostDebugLocObserver &Observer,
+ LegalizerHelper &Helper) const;
bool legalizeCTPOP(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
bool legalizeAtomicCmpxchg128(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee7fb20c23aa7..4841e72b19bcf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1216,8 +1216,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
getActionDefinitionsBuilder(G_FPOWI)
- .clampScalar(0, MinScalarFPTy, S32)
- .lower();
+ .clampScalar(0, MinScalarFPTy, S32)
+ .custom();
auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
Log2Ops.customFor({S32});
@@ -2127,6 +2127,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
return legalizeFExp(MI, B);
case TargetOpcode::G_FPOW:
return legalizeFPow(MI, B);
+ case TargetOpcode::G_FPOWI:
+ return legalizeFPowI(Helper, MI, B, LocObserver);
case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B);
case TargetOpcode::G_BUILD_VECTOR:
@@ -3731,6 +3733,22 @@ bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeFPowI(
+ LegalizerHelper &Helper, MachineInstr &MI, MachineIRBuilder &B,
+ LostDebugLocObserver &LocObserver) const {
+ if (Helper.lowerFPOWI(MI) == LegalizerHelper::Legalized)
+ return true;
+
+ auto [Dst, Base, Exp] = MI.getFirst3Regs();
+ LLT Ty = B.getMRI()->getType(Dst);
+
+ auto CvtSrc1 = B.buildSITOFP(Ty, Exp);
+ B.buildFPow(Dst, Base, CvtSrc1, MI.getFlags());
+ MI.eraseFromParent();
+
+ return true;
+}
+
// Find a source register, ignoring any possible source modifiers.
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
Register ModSrc = OrigSrc;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1d821dadc21..72910fa1828d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -95,6 +95,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
unsigned Flags) const;
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFPowI(LegalizerHelper &Helper, MachineInstr &MI,
+ MachineIRBuilder &B,
+ LostDebugLocObserver &LocObserver) const;
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir
new file mode 100644
index 0000000000000..c4261277d343e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir
@@ -0,0 +1,145 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-- -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: fpowi_s64_zero
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s64_zero
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; CHECK-NEXT: $d0 = COPY [[C]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s64) = G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_zero
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s32_zero
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: $s0 = COPY [[C]](s32)
+ %0:_(s32) = COPY $s0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s32) = G_FPOWI %0, %2(s32)
+ $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_positive
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_positive
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY2]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[FMUL1]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FMUL2]], [[COPY]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[FMUL3]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[COPY3]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 5
+ %3:_(s64) = G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s64_negative
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s64_negative
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY2]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[FMUL1]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FMUL2]], [[COPY]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s64) = G_FDIV [[C]], [[FMUL3]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[FDIV]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[COPY3]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 -5
+ %3:_(s64) = G_FPOWI %0, %2(s32)
+ $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_negative
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_s32_negative
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY2]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[COPY]]
+ ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[COPY]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s32) = G_FDIV [[C]], [[FMUL3]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[FDIV]](s32)
+ ; CHECK-NEXT: $s0 = COPY [[COPY3]](s32)
+ %0:_(s32) = COPY $s0
+ %1:_(s32) = COPY $w0
+ %2:_(s32) = G_CONSTANT i32 -5
+ %3:_(s32) = G_FPOWI %0, %2(s32)
+ $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_libcall
+body: |
+ bb.0:
+ liveins: $d0, $w0
+
+ ; CHECK-LABEL: name: fpowi_libcall
+ ; CHECK: liveins: $d0, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $d0 = COPY [[COPY]](s64)
+ ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: BL &__powidf2, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $w0, implicit-def $d0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: $d0 = COPY [[COPY2]](s64)
+ %0:_(s64) = COPY $d0
+ %1:_(s32) = COPY $w0
+ %2:_(s64) = G_FPOWI %0, %1(s32)
+ $d0 = COPY %2(s64)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index b169063d67872..5bd9378a6cf77 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -141,51 +141,57 @@ define float @v_powi_1_f32(float %l) {
}
define float @v_powi_neg1_f32(float %l) {
-; GFX78-LABEL: v_powi_neg1_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg1_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT: v_rcp_f32_e32 v2, v1
+; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg1_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v1
+; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_neg1_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
+; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -1.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 -1)
ret float %res
@@ -195,99 +201,74 @@ define float @v_powi_2_f32(float %l) {
; GFX78-LABEL: v_powi_2_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_2_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 2.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 2)
ret float %res
}
define float @v_powi_neg2_f32(float %l) {
-; GFX78-LABEL: v_powi_neg2_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg2_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT: v_rcp_f32_e32 v2, v1
+; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg2_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v1
+; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_neg2_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
+; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -2.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 -2)
ret float %res
@@ -297,48 +278,18 @@ define float @v_powi_4_f32(float %l) {
; GFX78-LABEL: v_powi_4_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 4.0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v1, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_4_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 4.0, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 4)
ret float %res
@@ -348,48 +299,28 @@ define float @v_powi_8_f32(float %l) {
; GFX78-LABEL: v_powi_8_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41000000, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v1, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_8_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41000000, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 8)
ret float %res
@@ -399,48 +330,48 @@ define float @v_powi_16_f32(float %l) {
; GFX78-LABEL: v_powi_16_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41800000, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v1, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41800000, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 16)
ret float %res
@@ -450,99 +381,830 @@ define float @v_powi_128_f32(float %l) {
; GFX78-LABEL: v_powi_128_f32:
; GFX78: ; %bb.0:
; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x43000000, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT: v_mul_f32_e32 v1, v0, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX78-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_128_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x43000000, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 128)
ret float %res
}
define float @v_powi_neg128_f32(float %l) {
-; GFX78-LABEL: v_powi_neg128_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0xc3000000, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg128_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT: v_rcp_f32_e32 v2, v1
+; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg128_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f32_e32 v1, v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v1
+; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_neg128_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_log_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0xc3000000, v0
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 -128)
ret float %res
More information about the llvm-commits
mailing list