[llvm] [GlobalISel][AArch64][AMDGPU] Lower FPOWI into series of multiplication (PR #95217)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 12 05:05:08 PDT 2024


https://github.com/isuckatcs updated https://github.com/llvm/llvm-project/pull/95217

>From fefe3628fd57c9e1ecbc9e3e668a1c6dfe943c18 Mon Sep 17 00:00:00 2001
From: isuckatcs <65320245+isuckatcs at users.noreply.github.com>
Date: Wed, 12 Jun 2024 11:38:41 +0200
Subject: [PATCH] [GlobalISel][AArch64][AMDGPU] Lower FPOWI into series of
 multiplication

SelectionDAG already converts FPOWI into multiplications, this patch
introduces the same optimization into GlobalISel.
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   33 +-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   17 +-
 .../AArch64/GISel/AArch64LegalizerInfo.h      |    2 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   22 +-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |    3 +
 .../AArch64/GlobalISel/legalize-fpowi.mir     |  145 ++
 .../CodeGen/AMDGPU/GlobalISel/llvm.powi.ll    | 1254 +++++++++++++----
 7 files changed, 1169 insertions(+), 307 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 9830b521797c1..3549606ef9490 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7142,14 +7142,37 @@ LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
   return UnableToLegalize;
 }
 
-// TODO: If RHS is a constant SelectionDAGBuilder expands this into a
-// multiplication tree.
 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
-  auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+  auto [Dst, Base, Exp] = MI.getFirst3Regs();
   LLT Ty = MRI.getType(Dst);
 
-  auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
-  MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+  std::optional<int64_t> ConstantExpValue = getIConstantVRegSExtVal(Exp, MRI);
+
+  if (!ConstantExpValue)
+    return UnableToLegalize;
+
+  int64_t OriginalExprVal = *ConstantExpValue;
+  int64_t ExpVal = OriginalExprVal;
+
+  if (ExpVal == 0) {
+    MIRBuilder.buildFConstant(Dst, 1.0);
+    MI.removeFromParent();
+    return Legalized;
+  }
+
+  if (ExpVal < 0)
+    ExpVal = -ExpVal;
+
+  auto Res = MIRBuilder.buildCopy(Ty, Base);
+  while (--ExpVal > 0)
+    Res = MIRBuilder.buildFMul(Ty, Res, Base);
+
+  // If the original was negative, invert the result, producing 1/(x*x*x).
+  if (OriginalExprVal < 0)
+    Res = MIRBuilder.buildFDiv(Ty, MIRBuilder.buildFConstant(Ty, 1.0), Res);
+
+  MIRBuilder.buildCopy(Dst, Res);
   MI.eraseFromParent();
   return Legalized;
 }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 42cd43c3afa37..0a782c9fb0891 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -274,10 +274,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       // Regardless of FP16 support, widen 16-bit elements to 32-bits.
       .minScalar(0, s32)
       .libcallFor({s32, s64});
-  getActionDefinitionsBuilder(G_FPOWI)
-      .scalarize(0)
-      .minScalar(0, s32)
-      .libcallFor({{s32, s32}, {s64, s32}});
+  getActionDefinitionsBuilder(G_FPOWI).scalarize(0).minScalar(0, s32).customFor(
+      {{s32, s32}, {s64, s32}});
 
   getActionDefinitionsBuilder(G_INSERT)
       .legalIf(all(typeInSet(0, {s32, s64, p0}),
@@ -1263,6 +1261,8 @@ bool AArch64LegalizerInfo::legalizeCustom(
   case TargetOpcode::G_FSHL:
   case TargetOpcode::G_FSHR:
     return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
+  case TargetOpcode::G_FPOWI:
+    return legalizeFPowI(MI, LocObserver, Helper);
   case TargetOpcode::G_ROTR:
     return legalizeRotate(MI, MRI, Helper);
   case TargetOpcode::G_CTPOP:
@@ -1344,6 +1344,15 @@ bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
   return true;
 }
 
+bool AArch64LegalizerInfo::legalizeFPowI(MachineInstr &MI,
+                                         LostDebugLocObserver &Observer,
+                                         LegalizerHelper &Helper) const {
+  if (Helper.lowerFPOWI(MI) == LegalizerHelper::Legalized)
+    return true;
+
+  return Helper.libcall(MI, Observer) == LegalizerHelper::Legalized;
+}
+
 bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 00d85a36e4b2c..01f6541d1ac1d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -56,6 +56,8 @@ class AArch64LegalizerInfo : public LegalizerInfo {
                            MachineIRBuilder &MIRBuilder,
                            GISelChangeObserver &Observer,
                            LegalizerHelper &Helper) const;
+  bool legalizeFPowI(MachineInstr &MI, LostDebugLocObserver &Observer,
+                     LegalizerHelper &Helper) const;
   bool legalizeCTPOP(MachineInstr &MI, MachineRegisterInfo &MRI,
                      LegalizerHelper &Helper) const;
   bool legalizeAtomicCmpxchg128(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee7fb20c23aa7..4841e72b19bcf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1216,8 +1216,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
         .scalarize(0);
 
   getActionDefinitionsBuilder(G_FPOWI)
-    .clampScalar(0, MinScalarFPTy, S32)
-    .lower();
+      .clampScalar(0, MinScalarFPTy, S32)
+      .custom();
 
   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
   Log2Ops.customFor({S32});
@@ -2127,6 +2127,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
     return legalizeFExp(MI, B);
   case TargetOpcode::G_FPOW:
     return legalizeFPow(MI, B);
+  case TargetOpcode::G_FPOWI:
+    return legalizeFPowI(Helper, MI, B, LocObserver);
   case TargetOpcode::G_FFLOOR:
     return legalizeFFloor(MI, MRI, B);
   case TargetOpcode::G_BUILD_VECTOR:
@@ -3731,6 +3733,22 @@ bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeFPowI(
+    LegalizerHelper &Helper, MachineInstr &MI, MachineIRBuilder &B,
+    LostDebugLocObserver &LocObserver) const {
+  if (Helper.lowerFPOWI(MI) == LegalizerHelper::Legalized)
+    return true;
+
+  auto [Dst, Base, Exp] = MI.getFirst3Regs();
+  LLT Ty = B.getMRI()->getType(Dst);
+
+  auto CvtSrc1 = B.buildSITOFP(Ty, Exp);
+  B.buildFPow(Dst, Base, CvtSrc1, MI.getFlags());
+  MI.eraseFromParent();
+
+  return true;
+}
+
 // Find a source register, ignoring any possible source modifiers.
 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
   Register ModSrc = OrigSrc;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1d821dadc21..72910fa1828d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -95,6 +95,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
                           unsigned Flags) const;
   bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
   bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
+  bool legalizeFPowI(LegalizerHelper &Helper, MachineInstr &MI,
+                     MachineIRBuilder &B,
+                     LostDebugLocObserver &LocObserver) const;
   bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
                       MachineIRBuilder &B) const;
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir
new file mode 100644
index 0000000000000..c4261277d343e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpowi.mir
@@ -0,0 +1,145 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-- -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: fpowi_s64_zero
+body: |
+    bb.0:
+      liveins: $d0, $w0
+
+    ; CHECK-LABEL: name: fpowi_s64_zero
+    ; CHECK: liveins: $d0, $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+    ; CHECK-NEXT: $d0 = COPY [[C]](s64)
+      %0:_(s64) = COPY $d0
+      %1:_(s32) = COPY $w0
+      %2:_(s32) = G_CONSTANT i32 0
+      %3:_(s64) = G_FPOWI %0, %2(s32)
+      $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_zero
+body: |
+    bb.0:
+      liveins: $d0, $w0
+
+    ; CHECK-LABEL: name: fpowi_s32_zero
+    ; CHECK: liveins: $d0, $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: $s0 = COPY [[C]](s32)
+      %0:_(s32) = COPY $s0
+      %1:_(s32) = COPY $w0
+      %2:_(s32) = G_CONSTANT i32 0
+      %3:_(s32) = G_FPOWI %0, %2(s32)
+      $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_positive
+body: |
+    bb.0:
+      liveins: $d0, $w0
+
+    ; CHECK-LABEL: name: fpowi_positive
+    ; CHECK: liveins: $d0, $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY2]], [[COPY]]
+    ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[COPY]]
+    ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[FMUL1]], [[COPY]]
+    ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FMUL2]], [[COPY]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[FMUL3]](s64)
+    ; CHECK-NEXT: $d0 = COPY [[COPY3]](s64)
+      %0:_(s64) = COPY $d0
+      %1:_(s32) = COPY $w0
+      %2:_(s32) = G_CONSTANT i32 5
+      %3:_(s64) = G_FPOWI %0, %2(s32)
+      $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s64_negative
+body: |
+    bb.0:
+      liveins: $d0, $w0
+
+    ; CHECK-LABEL: name: fpowi_s64_negative
+    ; CHECK: liveins: $d0, $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY2]], [[COPY]]
+    ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[COPY]]
+    ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[FMUL1]], [[COPY]]
+    ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FMUL2]], [[COPY]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+    ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s64) = G_FDIV [[C]], [[FMUL3]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[FDIV]](s64)
+    ; CHECK-NEXT: $d0 = COPY [[COPY3]](s64)
+      %0:_(s64) = COPY $d0
+      %1:_(s32) = COPY $w0
+      %2:_(s32) = G_CONSTANT i32 -5
+      %3:_(s64) = G_FPOWI %0, %2(s32)
+      $d0 = COPY %3(s64)
+...
+
+---
+name: fpowi_s32_negative
+body: |
+    bb.0:
+      liveins: $d0, $w0
+
+    ; CHECK-LABEL: name: fpowi_s32_negative
+    ; CHECK: liveins: $d0, $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY2]], [[COPY]]
+    ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[COPY]]
+    ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL1]], [[COPY]]
+    ; CHECK-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMUL2]], [[COPY]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s32) = G_FDIV [[C]], [[FMUL3]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[FDIV]](s32)
+    ; CHECK-NEXT: $s0 = COPY [[COPY3]](s32)
+      %0:_(s32) = COPY $s0
+      %1:_(s32) = COPY $w0
+      %2:_(s32) = G_CONSTANT i32 -5
+      %3:_(s32) = G_FPOWI %0, %2(s32)
+      $s0 = COPY %3(s32)
+...
+
+---
+name: fpowi_libcall
+body: |
+    bb.0:
+      liveins: $d0, $w0
+
+    ; CHECK-LABEL: name: fpowi_libcall
+    ; CHECK: liveins: $d0, $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK-NEXT: $d0 = COPY [[COPY]](s64)
+    ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32)
+    ; CHECK-NEXT: BL &__powidf2, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $w0, implicit-def $d0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: $d0 = COPY [[COPY2]](s64)
+      %0:_(s64) = COPY $d0
+      %1:_(s32) = COPY $w0
+      %2:_(s64) = G_FPOWI %0, %1(s32)
+      $d0 = COPY %2(s64)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index b169063d67872..5bd9378a6cf77 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -141,51 +141,57 @@ define float @v_powi_1_f32(float %l) {
 }
 
 define float @v_powi_neg1_f32(float %l) {
-; GFX78-LABEL: v_powi_neg1_f32:
-; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, -1.0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg1_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX7-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg1_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT:    v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT:    v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_neg1_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, -1.0, v0
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT:    v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT:    v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 -1)
   ret float %res
@@ -195,99 +201,74 @@ define float @v_powi_2_f32(float %l) {
 ; GFX78-LABEL: v_powi_2_f32:
 ; GFX78:       ; %bb.0:
 ; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_2_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 2.0, v0
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 2)
   ret float %res
 }
 
 define float @v_powi_neg2_f32(float %l) {
-; GFX78-LABEL: v_powi_neg2_f32:
-; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, -2.0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg2_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX7-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg2_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT:    v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT:    v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_neg2_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, -2.0, v0
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 -2)
   ret float %res
@@ -297,48 +278,18 @@ define float @v_powi_4_f32(float %l) {
 ; GFX78-LABEL: v_powi_4_f32:
 ; GFX78:       ; %bb.0:
 ; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, 4.0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    v_mul_f32_e32 v1, v0, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_4_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 4.0, v0
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 4)
   ret float %res
@@ -348,48 +299,28 @@ define float @v_powi_8_f32(float %l) {
 ; GFX78-LABEL: v_powi_8_f32:
 ; GFX78:       ; %bb.0:
 ; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, 0x41000000, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    v_mul_f32_e32 v1, v0, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_8_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 0x41000000, v0
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 8)
   ret float %res
@@ -399,48 +330,48 @@ define float @v_powi_16_f32(float %l) {
 ; GFX78-LABEL: v_powi_16_f32:
 ; GFX78:       ; %bb.0:
 ; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, 0x41800000, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    v_mul_f32_e32 v1, v0, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_16_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 0x41800000, v0
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 16)
   ret float %res
@@ -450,99 +381,830 @@ define float @v_powi_128_f32(float %l) {
 ; GFX78-LABEL: v_powi_128_f32:
 ; GFX78:       ; %bb.0:
 ; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, 0x43000000, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    v_mul_f32_e32 v1, v0, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX78-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_128_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 0x43000000, v0
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 128)
   ret float %res
 }
 
 define float @v_powi_neg128_f32(float %l) {
-; GFX78-LABEL: v_powi_neg128_f32:
-; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, 0xc3000000, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_neg128_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v1, v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX7-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
+; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
+; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_neg128_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v1, v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; GFX8-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX8-NEXT:    v_mul_f32_e32 v4, v2, v3
+; GFX8-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; GFX8-NEXT:    v_fma_f32 v4, v5, v3, v4
+; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_neg128_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 0xc3000000, v0
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
+; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
+; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v5, -v1, v3, v4
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v1, -v1, v3, v4
+; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 -128)
   ret float %res



More information about the llvm-commits mailing list