[llvm] LoopIdiomRecognize: detect and convert powi idiom (PR #72650)

Fri Dec 1 07:29:47 PST 2023

https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/72650

>From 392e99c727b6c0fb376e924e1e6fd1043f8970b2 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <Ramkumar.Ramachandra at imgtec.com>
Date: Fri, 17 Nov 2023 12:35:28 +0000
Subject: [PATCH 1/2] LoopIdiomRecognize: add negative tests for powi idiom

The following code, when compiled under -ffast-math, produces bad
codegen due to LoopVectorize:

  float powi(float base, int exp) {
    float result = 1.0;
    for (int i = 0; i < exp; ++i)
      result *= base;
    return result;
  }

It can easily be replaced with the llvm.powi intrinsic, when the
exponent is a C int type. This is the job of LoopIdiomRecognize, and has
been marked as a TODO item for years. In preparation to fulfill this
wish, add negative tests corresponding to variations of this program.
---
 llvm/test/Transforms/LoopIdiom/powi.ll | 471 +++++++++++++++++++++++++
 1 file changed, 471 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopIdiom/powi.ll

diff --git a/llvm/test/Transforms/LoopIdiom/powi.ll b/llvm/test/Transforms/LoopIdiom/powi.ll
new file mode 100644
index 000000000000000..807425c214ea692
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/powi.ll
@@ -0,0 +1,471 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes='loop(loop-idiom,loop-deletion,indvars),function(gvn,simplifycfg)' < %s -S | FileCheck %s
+
+define float @powi_f32(float %base, i32 %exp) {
+; CHECK-LABEL: define float @powi_f32(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[MERGE_DEC]], -1
+; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    ret float [[RESULT_LCSSA]]
+;
+entry:
+  %cmp.not = icmp eq i32 %exp, 0
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %result = phi float [ %mul, %while.body ], [ 1.000000e+00, %entry ]
+  %merge.dec = phi i32 [ %dec, %while.body ], [ %exp, %entry ]
+  %mul = fmul fast float %result, %base
+  %dec = add nsw i32 %merge.dec, -1
+  %cmp.eq = icmp eq i32 %dec, 0
+  br i1 %cmp.eq, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %result.lcssa = phi float [ 1.000000e+00, %entry ], [ %mul, %while.body ]
+  ret float %result.lcssa
+}
+
+define double @powi_f64(double %base, i32 %exp) {
+; CHECK-LABEL: define double @powi_f64(
+; CHECK-SAME: double [[BASE:%.*]], i32 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[RESULT:%.*]] = phi double [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast double [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[MERGE_DEC]], -1
+; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi double [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    ret double [[RESULT_LCSSA]]
+;
+entry:
+  %cmp.not = icmp eq i32 %exp, 0
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %result = phi double [ %mul, %while.body ], [ 1.000000e+00, %entry ]
+  %merge.dec = phi i32 [ %dec, %while.body ], [ %exp, %entry ]
+  %mul = fmul fast double %result, %base
+  %dec = add nsw i32 %merge.dec, -1
+  %cmp.eq = icmp eq i32 %dec, 0
+  br i1 %cmp.eq, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %result.lcssa = phi double [ 1.000000e+00, %entry ], [ %mul, %while.body ]
+  ret double %result.lcssa
+}
+
+define double @powi_i16_iv(double %base, i16 %exp) {
+; CHECK-LABEL: define double @powi_i16_iv(
+; CHECK-SAME: double [[BASE:%.*]], i16 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i16 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[RESULT:%.*]] = phi double [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i16 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast double [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[DEC]] = add nsw i16 [[MERGE_DEC]], -1
+; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i16 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi double [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    ret double [[RESULT_LCSSA]]
+;
+entry:
+  %cmp.not = icmp eq i16 %exp, 0
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %result = phi double [ %mul, %while.body ], [ 1.000000e+00, %entry ]
+  %merge.dec = phi i16 [ %dec, %while.body ], [ %exp, %entry ]
+  %mul = fmul fast double %result, %base
+  %dec = add nsw i16 %merge.dec, -1
+  %cmp.eq = icmp eq i16 %dec, 0
+  br i1 %cmp.eq, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %result.lcssa = phi double [ 1.000000e+00, %entry ], [ %mul, %while.body ]
+  ret double %result.lcssa
+}
+
+define float @powi_canonical_iv_signed(float %base, i32 %exp) {
+; CHECK-LABEL: define float @powi_canonical_iv_signed(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_SGT:%.*]] = icmp sgt i32 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP_SGT]], label [[FOR_BODY:%.*]], label [[EXIT:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[EXP]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[FOR_BODY]] ]
+; CHECK-NEXT:    ret float [[RESULT_LCSSA]]
+;
+entry:
+  %cmp.sgt = icmp sgt i32 %exp, 0
+  br i1 %cmp.sgt, label %for.body, label %exit
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %result = phi float [ %mul, %for.body ], [ 1.000000e+00, %entry ]
+  %mul = fmul fast float %result, %base
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, %exp
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body, %entry
+  %result.lcssa = phi float [ 1.000000e+00, %entry ], [ %mul, %for.body ]
+  ret float %result.lcssa
+}
+
+define float @powi_canonical_iv_unsigned(float %base, i32 %exp) {
+; CHECK-LABEL: define float @powi_canonical_iv_unsigned(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[EXIT:%.*]], label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[EXP]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[FOR_BODY]] ]
+; CHECK-NEXT:    ret float [[RESULT_LCSSA]]
+;
+entry:
+  %cmp.eq = icmp eq i32 %exp, 0
+  br i1 %cmp.eq, label %exit, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %result = phi float [ %mul, %for.body ], [ 1.000000e+00, %entry ]
+  %mul = fmul fast float %result, %base
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, %exp
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body, %entry
+  %result.lcssa = phi float [ 1.000000e+00, %entry ], [ %mul, %for.body ]
+  ret float %result.lcssa
+}
+
+define float @powi_const_i32_exp(float %base) {
+; CHECK-LABEL: define float @powi_const_i32_exp(
+; CHECK-SAME: float [[BASE:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], 2147483647
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret float [[MUL]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %result = phi float [ 1.000000e+00, %entry ], [ %mul, %for.body ]
+  %mul = fmul fast float %result, %base
+  %inc = add nuw nsw i32 %iv, 1
+  %cmp = icmp eq i32 %inc, 2147483647
+  br i1 %cmp, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret float %mul
+}
+
+define float @powi_unrelated_computation(float %base, i32 %exp) {
+; CHECK-LABEL: define float @powi_unrelated_computation(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[EXIT:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[UNRELATED:%.*]] = phi i32 [ [[UNRELATED_XOR:%.*]], [[FOR_BODY]] ], [ 5, [[ENTRY]] ]
+; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[UNRELATED_XOR]] = xor i32 [[IV]], [[UNRELATED]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[EXP]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cleanup:
+; CHECK-NEXT:    [[TMP0:%.*]] = sitofp i32 [[UNRELATED_XOR]] to float
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast float [[MUL]], [[TMP0]]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[ADD:%.*]] = phi float [ 6.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[FOR_CLEANUP]] ]
+; CHECK-NEXT:    ret float [[ADD]]
+;
+entry:
+  %cmp = icmp sgt i32 %exp, 0
+  br i1 %cmp, label %for.body, label %exit
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %unrelated = phi i32 [ %unrelated.xor, %for.body ], [ 5, %entry ]
+  %result = phi float [ %mul, %for.body ], [ 1.000000e+00, %entry ]
+  %mul = fmul fast float %result, %base
+  %unrelated.xor = xor i32 %iv, %unrelated
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, %exp
+  br i1 %exitcond, label %for.cleanup, label %for.body
+
+for.cleanup:                                     ; preds = %for.body
+  %0 = sitofp i32 %unrelated.xor to float
+  %1 = fadd fast float %mul, %0
+  br label %exit
+
+exit:                                           ; preds = %for.cleanup, %entry
+  %add = phi float [ 6.000000e+00, %entry ], [ %1, %for.cleanup ]
+  ret float %add
+}
+
+define float @powi_afn(float %base, i32 %exp) {
+; CHECK-LABEL: define float @powi_afn(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul afn float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[MERGE_DEC]], -1
+; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    ret float [[RESULT_LCSSA]]
+;
+entry:
+  %cmp.not = icmp eq i32 %exp, 0
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %result = phi float [ %mul, %while.body ], [ 1.000000e+00, %entry ]
+  %merge.dec = phi i32 [ %dec, %while.body ], [ %exp, %entry ]
+  %mul = fmul afn float %result, %base
+  %dec = add nsw i32 %merge.dec, -1
+  %cmp.eq = icmp eq i32 %dec, 0
+  br i1 %cmp.eq, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %result.lcssa = phi float [ 1.000000e+00, %entry ], [ %mul, %while.body ]
+  ret float %result.lcssa
+}
+
+; Negative tests
+
+; The powi idiom is only legal for a base of floating-point type
+define i32 @powi_i32_base(i32 %base, i32 %exp) {
+; CHECK-LABEL: define i32 @powi_i32_base(
+; CHECK-SAME: i32 [[BASE:%.*]], i32 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[RESULT:%.*]] = phi i32 [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = mul nsw i32 [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[MERGE_DEC]], -1
+; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    ret i32 [[RESULT_LCSSA]]
+;
+entry:
+  %cmp.not = icmp eq i32 %exp, 0
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %result = phi i32 [ %mul, %while.body ], [ 1, %entry ]
+  %merge.dec = phi i32 [ %dec, %while.body ], [ %exp, %entry ]
+  %mul = mul nsw i32 %result, %base
+  %dec = add nsw i32 %merge.dec, -1
+  %cmp.eq = icmp eq i32 %dec, 0
+  br i1 %cmp.eq, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %result.lcssa = phi i32 [ 1, %entry ], [ %mul, %while.body ]
+  ret i32 %result.lcssa
+}
+
+; The powi idiom is only legal when the fmul carries at least the afn flag
+define float @powi_noafn(float %base, i32 %exp) {
+; CHECK-LABEL: define float @powi_noafn(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[MERGE_DEC]], -1
+; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    ret float [[RESULT_LCSSA]]
+;
+entry:
+  %cmp.not = icmp eq i32 %exp, 0
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %result = phi float [ %mul, %while.body ], [ 1.000000e+00, %entry ]
+  %merge.dec = phi i32 [ %dec, %while.body ], [ %exp, %entry ]
+  %mul = fmul float %result, %base
+  %dec = add nsw i32 %merge.dec, -1
+  %cmp.eq = icmp eq i32 %dec, 0
+  br i1 %cmp.eq, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %result.lcssa = phi float [ 1.000000e+00, %entry ], [ %mul, %while.body ]
+  ret float %result.lcssa
+}
+
+; llvm.powi can only be generated for an exponent that is, at most, a C int type.
+define double @powi_i64_iv(double %base, i64 %exp) {
+; CHECK-LABEL: define double @powi_i64_iv(
+; CHECK-SAME: double [[BASE:%.*]], i64 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[RESULT:%.*]] = phi double [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i64 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast double [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[DEC]] = add nsw i64 [[MERGE_DEC]], -1
+; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i64 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi double [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    ret double [[RESULT_LCSSA]]
+;
+entry:
+  %cmp.not = icmp eq i64 %exp, 0
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %result = phi double [ %mul, %while.body ], [ 1.000000e+00, %entry ]
+  %merge.dec = phi i64 [ %dec, %while.body ], [ %exp, %entry ]
+  %mul = fmul fast double %result, %base
+  %dec = add nsw i64 %merge.dec, -1
+  %cmp.eq = icmp eq i64 %dec, 0
+  br i1 %cmp.eq, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %result.lcssa = phi double [ 1.000000e+00, %entry ], [ %mul, %while.body ]
+  ret double %result.lcssa
+}
+
+define float @powi_const_i64_iv(float %base) {
+; CHECK-LABEL: define float @powi_const_i64_iv(
+; CHECK-SAME: float [[BASE:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[EXP:%.*]] = phi i64 [ 2147483648, [[ENTRY:%.*]] ], [ [[DEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[DEC]] = add nsw i64 [[EXP]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret float [[MUL]]
+;
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %exp = phi i64 [ 2147483648, %entry ], [ %dec, %while.body ]
+  %result = phi float [ 1.000000e+00, %entry ], [ %mul, %while.body ]
+  %mul = fmul fast float %result, %base
+  %dec = add nsw i64 %exp, -1
+  %cmp = icmp eq i64 %dec, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body
+  ret float %mul
+}
+
+; Converting to powi idiom is only profitable if the result of the fmul is used
+; exclusively outside the loop.
+define float @powi_fmul_result_used_in_loop(float %base, i32 %exp) {
+; CHECK-LABEL: define float @powi_fmul_result_used_in_loop(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[EXP]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[EXIT:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[UNRELATED:%.*]] = phi i32 [ [[UNRELATED_XOR:%.*]], [[FOR_BODY]] ], [ 5, [[ENTRY]] ]
+; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[MUL]] to i32
+; CHECK-NEXT:    [[UNRELATED_XOR]] = xor i32 [[CONV]], [[UNRELATED]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[EXP]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cleanup:
+; CHECK-NEXT:    [[TMP0:%.*]] = sitofp i32 [[UNRELATED_XOR]] to float
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast float [[MUL]], [[TMP0]]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[ADD:%.*]] = phi float [ 6.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[FOR_CLEANUP]] ]
+; CHECK-NEXT:    ret float [[ADD]]
+;
+entry:
+  %cmp = icmp sgt i32 %exp, 0
+  br i1 %cmp, label %for.body, label %exit
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %unrelated = phi i32 [ %unrelated.xor, %for.body ], [ 5, %entry ]
+  %result = phi float [ %mul, %for.body ], [ 1.000000e+00, %entry ]
+  %mul = fmul fast float %result, %base
+  %conv = fptosi float %mul to i32
+  %unrelated.xor = xor i32 %conv, %unrelated
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, %exp
+  br i1 %exitcond, label %for.cleanup, label %for.body
+
+for.cleanup:                                     ; preds = %for.body
+  %0 = sitofp i32 %unrelated.xor to float
+  %1 = fadd fast float %mul, %0
+  br label %exit
+
+exit:                                           ; preds = %for.cleanup, %entry
+  %add = phi float [ 6.000000e+00, %entry ], [ %1, %for.cleanup ]
+  ret float %add
+}

>From 81d6b53734c00f591277f425fabd79c0e6e19348 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <Ramkumar.Ramachandra at imgtec.com>
Date: Fri, 17 Nov 2023 12:35:28 +0000
Subject: [PATCH 2/2] LoopIdiomRecognize: detect and convert powi idiom

The following code, when compiled under -ffast-math, produces bad
codegen due to LoopVectorize:

  float powi(float base, int exp) {
    float result = 1.0;
    for (int i = 0; i < exp; ++i)
      result *= base;
    return result;
  }

It can easily be replaced with the llvm.powi intrinsic, when the
exponent is a C int type. This is the job of LoopIdiomRecognize, and has
been marked as a TODO item for years. Fulfill this wish, and replace
computations of this form with the llvm.powi intrinsic.
---
 .../Transforms/Scalar/LoopIdiomRecognize.h    |   3 +
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 176 +++++++++++++++---
 llvm/test/Transforms/LoopIdiom/powi.ll        | 112 +++--------
 3 files changed, 179 insertions(+), 112 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h b/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
index 0c6406d86185185..e8692d3726d57ae 100644
--- a/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
+++ b/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
@@ -34,6 +34,9 @@ struct DisableLIRP {
 
   /// When true, Memcpy is disabled.
   static bool Memcpy;
+
+  /// When true, Powi is disabled.
+  static bool Powi;
 };
 
 /// Performs Loop Idiom Recognize Pass.
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 3721564890ddb4e..8e1cc1cb784442f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -22,8 +22,6 @@
 //
 // Future loop memory idioms to recognize:
 //   memcmp, strlen, etc.
-// Future floating point idioms to recognize in -ffast-math mode:
-//   fpowi
 //
 // This could recognize common matrix multiplies and dot product idioms and
 // replace them with calls to BLAS (if linked in??).
@@ -94,6 +92,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "loop-idiom"
 
@@ -129,6 +128,14 @@ static cl::opt<bool, true>
                       cl::location(DisableLIRP::Memcpy), cl::init(false),
                       cl::ReallyHidden);
 
+bool DisableLIRP::Powi;
+static cl::opt<bool, true>
+    DisableLIRPPowi("disable-" DEBUG_TYPE "-powi",
+                    cl::desc("Proceed with loop idiom recognize pass, but do "
+                             "not convert the powi idiom."),
+                    cl::location(DisableLIRP::Powi), cl::init(false),
+                    cl::ReallyHidden);
+
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
     cl::desc("Use loop idiom recognition code size heuristics when compiling"
@@ -206,6 +213,7 @@ class LoopIdiomRecognize {
       const SCEV *BECount);
   bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
   bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
+  bool processLoopPowi(const SCEV *BECount);
 
   bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                                MaybeAlign StoreAlignment, Value *StoredVal,
@@ -298,13 +306,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
   ApplyCodeSizeHeuristics =
       L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
 
-  HasMemset = TLI->has(LibFunc_memset);
-  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
-  HasMemcpy = TLI->has(LibFunc_memcpy);
-
-  if (HasMemset || HasMemsetPattern || HasMemcpy)
-    if (SE->hasLoopInvariantBackedgeTakenCount(L))
-      return runOnCountableLoop();
+  if (SE->hasLoopInvariantBackedgeTakenCount(L))
+    return runOnCountableLoop();
 
   return runOnNoncountableLoop();
 }
@@ -549,33 +552,44 @@ bool LoopIdiomRecognize::runOnLoopBlock(
     BasicBlock *BB, const SCEV *BECount,
     SmallVectorImpl<BasicBlock *> &ExitBlocks) {
   // We can only promote stores in this block if they are unconditionally
-  // executed in the loop.  For a block to be unconditionally executed, it has
-  // to dominate all the exit blocks of the loop.  Verify this now.
+  // executed in the loop. The powi idiom also requires the block to be
+  // unconditionally executed. For a block to be unconditionally executed, it
+  // has to dominate all the exit blocks of the loop.
   for (BasicBlock *ExitBlock : ExitBlocks)
     if (!DT->dominates(BB, ExitBlock))
       return false;
 
   bool MadeChange = false;
-  // Look for store instructions, which may be optimized to memset/memcpy.
-  collectStores(BB);
 
-  // Look for a single store or sets of stores with a common base, which can be
-  // optimized into a memset (memset_pattern).  The latter most commonly happens
-  // with structs and handunrolled loops.
-  for (auto &SL : StoreRefsForMemset)
-    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
+  HasMemset = TLI->has(LibFunc_memset);
+  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
+  HasMemcpy = TLI->has(LibFunc_memcpy);
+
+  if (HasMemset || HasMemsetPattern || HasMemcpy) {
+    // Look for store instructions, which may be optimized to memset/memcpy.
+    collectStores(BB);
 
-  for (auto &SL : StoreRefsForMemsetPattern)
-    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
+    // Look for a single store or sets of stores with a common base, which can
+    // be optimized into a memset (memset_pattern).  The latter most commonly
+    // happens with structs and handunrolled loops.
+    for (auto &SL : StoreRefsForMemset)
+      MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
 
-  // Optimize the store into a memcpy, if it feeds an similarly strided load.
-  for (auto &SI : StoreRefsForMemcpy)
-    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+    for (auto &SL : StoreRefsForMemsetPattern)
+      MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
+
+    // Optimize the store into a memcpy, if it feeds a similarly strided load.
+    for (auto &SI : StoreRefsForMemcpy)
+      MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+
+    MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
+        BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
+    MadeChange |= processLoopMemIntrinsic<MemSetInst>(
+        BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
+  }
 
-  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
-      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
-  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
-      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
+  if (!DisableLIRP::Powi)
+    MadeChange |= processLoopPowi(BECount);
 
   return MadeChange;
 }
@@ -925,6 +939,112 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                  BECount, IsNegStride, /*IsLoopMemset=*/true);
 }
 
+static CallInst *createPowiIntrinsic(IRBuilder<> &IRBuilder, Value *Base,
+                                     Value *Exp, const DebugLoc &DL) {
+  Value *Ops[] = {Base, Exp};
+  Type *Tys[] = {Base->getType(), Exp->getType()};
+
+  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+  Function *Func = Intrinsic::getDeclaration(M, Intrinsic::powi, Tys);
+  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+  CI->setDebugLoc(DL);
+  return CI;
+}
+
+// Checks that Phi's only user is an fmul afn with a loop-invariant operand,
+// and returns the fmul instruction (nullptr if the pattern does not match).
+static Instruction *detectPowiIdiom(PHINode *Phi, BasicBlock *PH,
+                                    BasicBlock *Latch, Loop *CurLoop) {
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Performing powi idiom detection\n");
+
+  // The phi must have two incoming values (one from the preheader, and another
+  // from the latch), it must have one use (which we will subsequently check is
+  // an fmul afn instruction), and it must be a floating-point type.
+  if (Phi->getNumIncomingValues() != 2 || !Phi->hasOneUse() ||
+      Phi->getBasicBlockIndex(PH) < 0 || Phi->getBasicBlockIndex(Latch) < 0 ||
+      !Phi->getType()->isFloatingPointTy()) {
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Unable to operate on this PHI node\n");
+    return nullptr;
+  }
+
+  // Further, check that the incoming value from the preheader is 1.0.
+  auto *ConstFP = dyn_cast<ConstantFP>(Phi->getIncomingValueForBlock(PH));
+  if (!ConstFP || !ConstFP->isExactlyValue(1.0)) {
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Initial value comparison failed\n");
+    return nullptr;
+  }
+
+  auto *I = cast<Instruction>(Phi->use_begin()->getUser());
+  Value *Op1, *Op2;
+  if (!match(I, m_FMul(m_Value(Op1), m_Value(Op2))) || !I->hasApproxFunc()) {
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " fmul-afn test failed\n");
+    return nullptr;
+  }
+  for (Use &U : I->uses()) {
+    if (isa<PHINode>(U.getUser()))
+      continue;
+    if (U->isUsedInBasicBlock(Latch)) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE " FMul used inside loop\n");
+      return nullptr;
+    }
+  }
+  Value *Base = Op1 == Phi ? Op2 : Op1;
+  if (CurLoop->isLoopInvariant(Base))
+    return I;
+  else
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Base is not loop-invariant\n");
+  return nullptr;
+}
+
+/// Detect the powi idiom, and convert it to an intrinsic.
+bool LoopIdiomRecognize::processLoopPowi(const SCEV *BECount) {
+  // Only process loops whose IV is found and is at most 32 bits wide.
+  PHINode *IV = CurLoop->getInductionVariable(*SE);
+  if (!IV || IV->getType()->getScalarSizeInBits() > 32)
+    return false;
+
+  // If the loop doesn't have a valid preheader and latch, give up now.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  BasicBlock *Latch = CurLoop->getLoopLatch();
+  if (!PH || !Latch)
+    return false;
+
+  // Find the Phi corresponding to the powi idiom, amongst all phis except the
+  // induction phi.
+  for (PHINode &Phi : Latch->phis()) {
+    if (&Phi == IV)
+      continue;
+    if (Instruction *FMul = detectPowiIdiom(&Phi, PH, Latch, CurLoop)) {
+      // Find the trip count, and expand the SCEV to find the exponent of the
+      // powi.
+      IRBuilder<> Builder(PH->getTerminator());
+      SCEVExpander Expander(*SE, *DL, "loop-idiom");
+      SCEVExpanderCleaner ExpCleaner(Expander);
+      Type *ExpTy = Builder.getInt32Ty();
+      const SCEV *TripCount =
+          SE->getTripCountFromExitCount(BECount, ExpTy, CurLoop);
+      if (!Expander.isSafeToExpand(TripCount)) {
+        LLVM_DEBUG(dbgs() << DEBUG_TYPE " Trip count not safe to expand\n");
+        return false;
+      }
+      Value *Exp =
+          Expander.expandCodeFor(TripCount, ExpTy, PH->getTerminator());
+
+      // Insert the powi intrinsic, and replace FMul's uses outside the latch.
+      const DebugLoc &Loc = FMul->getDebugLoc();
+      Value *Base = isa<PHINode>(FMul->getOperand(0)) ? FMul->getOperand(1)
+                                                      : FMul->getOperand(0);
+      CallInst *Powi = createPowiIntrinsic(Builder, Base, Exp, Loc);
+      FMul->replaceUsesOutsideBlock(Powi, Latch);
+      ExpCleaner.markResultUsed();
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE " powi idiom optimized!\n");
+      return true;
+    }
+  }
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE " powi idiom detection failed\n");
+  return false;
+}
+
 /// mayLoopAccessLocation - Return true if the specified loop might access the
 /// specified pointer location, which is a loop-strided access.  The 'Access'
 /// argument specifies what the verboten forms of access are (read or write).
@@ -2216,8 +2336,6 @@ static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
   BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
   assert(LoopPreheaderBB && "There is always a loop preheader.");
 
-  using namespace PatternMatch;
-
   // Step 1: Check if the loop backedge is in desirable form.
 
   ICmpInst::Predicate Pred;
diff --git a/llvm/test/Transforms/LoopIdiom/powi.ll b/llvm/test/Transforms/LoopIdiom/powi.ll
index 807425c214ea692..f3bc4d6868a4b21 100644
--- a/llvm/test/Transforms/LoopIdiom/powi.ll
+++ b/llvm/test/Transforms/LoopIdiom/powi.ll
@@ -6,17 +6,9 @@ define float @powi_f32(float %base, i32 %exp) {
 ; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[EXP]], 0
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
-; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
-; CHECK-NEXT:    [[DEC]] = add nsw i32 [[MERGE_DEC]], -1
-; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
-; CHECK-NEXT:    ret float [[RESULT_LCSSA]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.powi.f32.i32(float [[BASE]], i32 [[EXP]])
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP_NOT]], float 1.000000e+00, float [[TMP0]]
+; CHECK-NEXT:    ret float [[SPEC_SELECT]]
 ;
 entry:
   %cmp.not = icmp eq i32 %exp, 0
@@ -40,17 +32,9 @@ define double @powi_f64(double %base, i32 %exp) {
 ; CHECK-SAME: double [[BASE:%.*]], i32 [[EXP:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[EXP]], 0
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[RESULT:%.*]] = phi double [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
-; CHECK-NEXT:    [[MUL]] = fmul fast double [[RESULT]], [[BASE]]
-; CHECK-NEXT:    [[DEC]] = add nsw i32 [[MERGE_DEC]], -1
-; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi double [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
-; CHECK-NEXT:    ret double [[RESULT_LCSSA]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.powi.f64.i32(double [[BASE]], i32 [[EXP]])
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP_NOT]], double 1.000000e+00, double [[TMP0]]
+; CHECK-NEXT:    ret double [[SPEC_SELECT]]
 ;
 entry:
   %cmp.not = icmp eq i32 %exp, 0
@@ -74,16 +58,9 @@ define double @powi_i16_iv(double %base, i16 %exp) {
 ; CHECK-SAME: double [[BASE:%.*]], i16 [[EXP:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i16 [[EXP]], 0
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[RESULT:%.*]] = phi double [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i16 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
-; CHECK-NEXT:    [[MUL]] = fmul fast double [[RESULT]], [[BASE]]
-; CHECK-NEXT:    [[DEC]] = add nsw i16 [[MERGE_DEC]], -1
-; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i16 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi double [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[EXP]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.powi.f64.i32(double [[BASE]], i32 [[TMP0]])
+; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = select i1 [[CMP_NOT]], double 1.000000e+00, double [[TMP1]]
 ; CHECK-NEXT:    ret double [[RESULT_LCSSA]]
 ;
 entry:
@@ -108,17 +85,9 @@ define float @powi_canonical_iv_signed(float %base, i32 %exp) {
 ; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP_SGT:%.*]] = icmp sgt i32 [[EXP]], 0
-; CHECK-NEXT:    br i1 [[CMP_SGT]], label [[FOR_BODY:%.*]], label [[EXIT:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ENTRY]] ]
-; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[EXP]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[FOR_BODY]] ]
-; CHECK-NEXT:    ret float [[RESULT_LCSSA]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.powi.f32.i32(float [[BASE]], i32 [[EXP]])
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP_SGT]], float [[TMP0]], float 1.000000e+00
+; CHECK-NEXT:    ret float [[SPEC_SELECT]]
 ;
 entry:
   %cmp.sgt = icmp sgt i32 %exp, 0
@@ -142,17 +111,9 @@ define float @powi_canonical_iv_unsigned(float %base, i32 %exp) {
 ; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[EXP]], 0
-; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[EXIT:%.*]], label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ENTRY]] ]
-; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[EXP]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[FOR_BODY]] ]
-; CHECK-NEXT:    ret float [[RESULT_LCSSA]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.powi.f32.i32(float [[BASE]], i32 [[EXP]])
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP_EQ]], float 1.000000e+00, float [[TMP0]]
+; CHECK-NEXT:    ret float [[SPEC_SELECT]]
 ;
 entry:
   %cmp.eq = icmp eq i32 %exp, 0
@@ -175,16 +136,8 @@ define float @powi_const_i32_exp(float %base) {
 ; CHECK-LABEL: define float @powi_const_i32_exp(
 ; CHECK-SAME: float [[BASE:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], 2147483647
-; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret float [[MUL]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.powi.f32.i32(float [[BASE]], i32 2147483647)
+; CHECK-NEXT:    ret float [[TMP0]]
 ;
 entry:
   br label %for.body
@@ -206,22 +159,23 @@ define float @powi_unrelated_computation(float %base, i32 %exp) {
 ; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[EXP]], 0
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.powi.f32.i32(float [[BASE]], i32 [[EXP]])
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[UNRELATED:%.*]] = phi i32 [ [[UNRELATED_XOR:%.*]], [[FOR_BODY]] ], [ 5, [[ENTRY]] ]
-; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ENTRY]] ]
-; CHECK-NEXT:    [[MUL]] = fmul fast float [[RESULT]], [[BASE]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[UNRELATED:%.*]] = phi i32 [ [[UNRELATED_XOR:%.*]], [[FOR_BODY]] ], [ 5, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[UNRELATED_XOR]] = xor i32 [[IV]], [[UNRELATED]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[EXP]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_CLEANUP:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.cleanup:
-; CHECK-NEXT:    [[TMP0:%.*]] = sitofp i32 [[UNRELATED_XOR]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast float [[MUL]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp i32 [[UNRELATED_XOR]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast float [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[ADD:%.*]] = phi float [ 6.000000e+00, [[ENTRY]] ], [ [[TMP1]], [[FOR_CLEANUP]] ]
+; CHECK-NEXT:    [[ADD:%.*]] = phi float [ 6.000000e+00, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_CLEANUP]] ]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
 entry:
@@ -253,17 +207,9 @@ define float @powi_afn(float %base, i32 %exp) {
 ; CHECK-SAME: float [[BASE:%.*]], i32 [[EXP:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[EXP]], 0
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
-; CHECK:       while.body:
-; CHECK-NEXT:    [[RESULT:%.*]] = phi float [ [[MUL:%.*]], [[WHILE_BODY]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[MERGE_DEC:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[EXP]], [[ENTRY]] ]
-; CHECK-NEXT:    [[MUL]] = fmul afn float [[RESULT]], [[BASE]]
-; CHECK-NEXT:    [[DEC]] = add nsw i32 [[MERGE_DEC]], -1
-; CHECK-NEXT:    [[CMP_EQ:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[CMP_EQ]], label [[WHILE_END]], label [[WHILE_BODY]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[RESULT_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[MUL]], [[WHILE_BODY]] ]
-; CHECK-NEXT:    ret float [[RESULT_LCSSA]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.powi.f32.i32(float [[BASE]], i32 [[EXP]])
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP_NOT]], float 1.000000e+00, float [[TMP0]]
+; CHECK-NEXT:    ret float [[SPEC_SELECT]]
 ;
 entry:
   %cmp.not = icmp eq i32 %exp, 0