[llvm] f334db9 - [llvm][CodeGen] Intrinsic `llvm.powi.*` code gen for vector arguments (#118242)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 18 16:57:35 PST 2024
Author: Zhaoxin Yang
Date: 2024-12-19T08:57:31+08:00
New Revision: f334db92be168876b618db72dc93078ce23ffa89
URL: https://github.com/llvm/llvm-project/commit/f334db92be168876b618db72dc93078ce23ffa89
DIFF: https://github.com/llvm/llvm-project/commit/f334db92be168876b618db72dc93078ce23ffa89.diff
LOG: [llvm][CodeGen] Intrinsic `llvm.powi.*` code gen for vector arguments (#118242)
Scalarize vector FPOWI instead of promoting the type. This allows the
scalar FPOWIs to be visited and converted to libcalls before promoting
the type.
FIXME: This should be done in LegalizeVectorOps/LegalizeDAG, but call
lowering needs the unpromoted EVT.
Without this patch, on some targets, such as RISCV64 and LoongArch64, the
i32 type is illegal and will be promoted. This causes the exponent type
check to fail when the ISD::FPOWI node generates a libcall.
Fixes https://github.com/llvm/llvm-project/issues/118079
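For reference, the new tests exercise IR of the following shape (a minimal
sketch mirroring llvm/test/CodeGen/LoongArch/lsx/fpowi.ll; it can be run
with llc --mtriple=loongarch64 --mattr=+lsx):
declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
define <4 x float> @powi_v4f32(<4 x float> %va, i32 %b) nounwind {
entry:
  ; Vector powi with an i32 exponent; before this patch it failed the
  ; exponent type check on targets where i32 is promoted.
  %res = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %va, i32 %b)
  ret <4 x float> %res
}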
Added:
llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 69f25ebc88004e..be7521f3416850 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2585,6 +2585,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ExpOp(SDNode *N) {
: RTLIB::getLDEXP(N->getValueType(0));
if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC)) {
+ // Scalarize vector FPOWI instead of promoting the type. This allows the
+ // scalar FPOWIs to be visited and converted to libcalls before promoting
+ // the type.
+ // FIXME: This should be done in LegalizeVectorOps/LegalizeDAG, but call
+ // lowering needs the unpromoted EVT.
+ if (IsPowI && N->getValueType(0).isVector())
+ return DAG.UnrollVectorOp(N);
SmallVector<SDValue, 3> NewOps(N->ops());
NewOps[1 + OpOffset] = SExtPromotedInteger(N->getOperand(1 + OpOffset));
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
new file mode 100644
index 00000000000000..f6b14a9bb000fd
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+declare <8 x float> @llvm.powi.v8f32.i32(<8 x float>, i32)
+
+define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind {
+; CHECK-LABEL: powi_v8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -80
+; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; CHECK-NEXT: xvst $xr0, $sp, 0 # 32-byte Folded Spill
+; CHECK-NEXT: addi.w $fp, $a0, 0
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 0
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 0
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 2
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 2
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 3
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 4
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 4
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 5
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 5
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 6
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 6
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 7
+; CHECK-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 80
+; CHECK-NEXT: ret
+entry:
+ %res = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> %va, i32 %b)
+ ret <8 x float> %res
+}
+
+declare <4 x double> @llvm.powi.v4f64.i32(<4 x double>, i32)
+
+define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind {
+; CHECK-LABEL: powi_v4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -80
+; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; CHECK-NEXT: xvst $xr0, $sp, 0 # 32-byte Folded Spill
+; CHECK-NEXT: addi.w $fp, $a0, 0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT: movgr2fr.d $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powidf2)
+; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT: movgr2fr.d $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powidf2)
+; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
+; CHECK-NEXT: movgr2fr.d $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powidf2)
+; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 2
+; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
+; CHECK-NEXT: movgr2fr.d $fa0, $a0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powidf2)
+; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 3
+; CHECK-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 80
+; CHECK-NEXT: ret
+entry:
+ %res = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> %va, i32 %b)
+ ret <4 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
new file mode 100644
index 00000000000000..b0f54e78c7a442
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
+
+declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
+
+define <4 x float> @powi_v4f32(<4 x float> %va, i32 %b) nounwind {
+; CHECK-LABEL: powi_v4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -48
+; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: addi.w $fp, $a0, 0
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
+; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 1
+; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 1
+; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
+; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 2
+; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 2
+; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
+; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
+; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powisf2)
+; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 3
+; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 48
+; CHECK-NEXT: ret
+entry:
+ %res = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %va, i32 %b)
+ ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.powi.v2f64.i32(<2 x double>, i32)
+
+define <2 x double> @powi_v2f64(<2 x double> %va, i32 %b) nounwind {
+; CHECK-LABEL: powi_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -48
+; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: addi.w $fp, $a0, 0
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powidf2)
+; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
+; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
+; CHECK-NEXT: move $a0, $fp
+; CHECK-NEXT: bl %plt(__powidf2)
+; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 1
+; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 48
+; CHECK-NEXT: ret
+entry:
+ %res = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> %va, i32 %b)
+ ret <2 x double> %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll
new file mode 100644
index 00000000000000..c6b8b602718b76
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll
@@ -0,0 +1,1251 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+v,+f,+d -target-abi=ilp32d -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+d -target-abi=lp64d -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefix=RV64
+
+define <1 x float> @powi_v1f32(<1 x float> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v1f32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vfmv.s.f v8, fa0
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v1f32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vfmv.s.f v8, fa0
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %a = call <1 x float> @llvm.powi.v1f32.i32(<1 x float> %x, i32 %y)
+ ret <1 x float> %a
+}
+declare <1 x float> @llvm.powi.v1f32.i32(<1 x float>, i32)
+
+define <2 x float> @powi_v2f32(<2 x float> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v2f32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vfmv.f.s fa0, v9
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fmv.s fs0, fa0
+; RV32-NEXT: flw fa0, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vfmv.v.f v8, fa0
+; RV32-NEXT: vfslide1down.vf v8, v8, fs0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v2f32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: fsd fs0, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: addi a1, sp, 32
+; RV64-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: sext.w s0, a0
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vfmv.f.s fa0, v9
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fmv.s fs0, fa0
+; RV64-NEXT: flw fa0, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vfmv.v.f v8, fa0
+; RV64-NEXT: vfslide1down.vf v8, v8, fs0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: fld fs0, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: ret
+ %a = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> %x, i32 %y)
+ ret <2 x float> %a
+}
+declare <2 x float> @llvm.powi.v2f32.i32(<2 x float>, i32)
+
+define <3 x float> @powi_v3f32(<3 x float> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v3f32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vfmv.f.s fa0, v9
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fmv.s fs0, fa0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: flw fa0, 16(a0) # 8-byte Folded Reload
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vfmv.v.f v8, fa0
+; RV32-NEXT: vfslide1down.vf v8, v8, fs0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vslidedown.vi v8, v8, 2
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: vslidedown.vi v8, v8, 1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v3f32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: fsd fs0, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 32
+; RV64-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: sext.w s0, a0
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vfmv.f.s fa0, v9
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fmv.s fs0, fa0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: flw fa0, 32(a0) # 8-byte Folded Reload
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vfmv.v.f v8, fa0
+; RV64-NEXT: vfslide1down.vf v8, v8, fs0
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: vslidedown.vi v8, v8, 1
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: fld fs0, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: ret
+ %a = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> %x, i32 %y)
+ ret <3 x float> %a
+}
+declare <3 x float> @llvm.powi.v3f32.i32(<3 x float>, i32)
+
+define <4 x float> @powi_v4f32(<4 x float> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v4f32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vfmv.f.s fa0, v9
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fmv.s fs0, fa0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: flw fa0, 16(a0) # 8-byte Folded Reload
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vfmv.v.f v8, fa0
+; RV32-NEXT: vfslide1down.vf v8, v8, fs0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vslidedown.vi v8, v8, 2
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v4f32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: fsd fs0, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 32
+; RV64-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: sext.w s0, a0
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vfmv.f.s fa0, v9
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fmv.s fs0, fa0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: flw fa0, 32(a0) # 8-byte Folded Reload
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vfmv.v.f v8, fa0
+; RV64-NEXT: vfslide1down.vf v8, v8, fs0
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: fld fs0, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: ret
+ %a = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %x, i32 %y)
+ ret <4 x float> %a
+}
+declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
+
+define <8 x float> @powi_v8f32(<8 x float> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vfmv.f.s fa0, v10
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fmv.s fs0, fa0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vfmv.v.f v8, fa0
+; RV32-NEXT: vfslide1down.vf v8, v8, fs0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 2
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vslidedown.vi v8, v8, 4
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vslidedown.vi v8, v8, 5
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vslidedown.vi v8, v8, 6
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vslidedown.vi v8, v8, 7
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: fsd fs0, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 32
+; RV64-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: sext.w s0, a0
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 1
+; RV64-NEXT: vfmv.f.s fa0, v10
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fmv.s fs0, fa0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vfmv.v.f v8, fa0
+; RV64-NEXT: vfslide1down.vf v8, v8, fs0
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vslidedown.vi v8, v8, 4
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vslidedown.vi v8, v8, 5
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vslidedown.vi v8, v8, 6
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vslidedown.vi v8, v8, 7
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: fld fs0, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: ret
+ %a = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> %x, i32 %y)
+ ret <8 x float> %a
+}
+declare <8 x float> @llvm.powi.v8f32.i32(<8 x float>, i32)
+
+define <16 x float> @powi_v16f32(<16 x float> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v16f32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -272
+; RV32-NEXT: sw ra, 268(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 264(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 260(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 272
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: mv s2, a0
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vs4r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: flw fa0, 124(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 188(sp)
+; RV32-NEXT: flw fa0, 120(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 184(sp)
+; RV32-NEXT: flw fa0, 116(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 180(sp)
+; RV32-NEXT: flw fa0, 112(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 176(sp)
+; RV32-NEXT: flw fa0, 108(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 172(sp)
+; RV32-NEXT: flw fa0, 104(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 168(sp)
+; RV32-NEXT: flw fa0, 100(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 164(sp)
+; RV32-NEXT: flw fa0, 96(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 160(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 128(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 140(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 2
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 136(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 1
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 132(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 7
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 156(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 6
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 152(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 5
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 148(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 4
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powisf2
+; RV32-NEXT: fsw fa0, 144(sp)
+; RV32-NEXT: addi a0, sp, 128
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -272
+; RV32-NEXT: lw ra, 268(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 264(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 260(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 272
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v16f32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -272
+; RV64-NEXT: sd ra, 264(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 256(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 248(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 272
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: addi a1, sp, 240
+; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: addi a1, sp, 64
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v8, (a1)
+; RV64-NEXT: flw fa0, 124(sp)
+; RV64-NEXT: sext.w s2, a0
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 188(sp)
+; RV64-NEXT: flw fa0, 120(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 184(sp)
+; RV64-NEXT: flw fa0, 116(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 180(sp)
+; RV64-NEXT: flw fa0, 112(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 176(sp)
+; RV64-NEXT: flw fa0, 108(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 172(sp)
+; RV64-NEXT: flw fa0, 104(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 168(sp)
+; RV64-NEXT: flw fa0, 100(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 164(sp)
+; RV64-NEXT: flw fa0, 96(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 160(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 128(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 140(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 136(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 1
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 132(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 7
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 156(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 6
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 152(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 5
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 148(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 4
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powisf2
+; RV64-NEXT: fsw fa0, 144(sp)
+; RV64-NEXT: addi a0, sp, 128
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -272
+; RV64-NEXT: ld ra, 264(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 256(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 248(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 272
+; RV64-NEXT: ret
+ %a = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> %x, i32 %y)
+ ret <16 x float> %a
+}
+declare <16 x float> @llvm.powi.v16f32.i32(<16 x float>, i32)
+
+define <1 x double> @powi_v1f64(<1 x double> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v1f64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vfmv.s.f v8, fa0
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v1f64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vfmv.s.f v8, fa0
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %a = call <1 x double> @llvm.powi.v1f64.i32(<1 x double> %x, i32 %y)
+ ret <1 x double> %a
+}
+declare <1 x double> @llvm.powi.v1f64.i32(<1 x double>, i32)
+
+define <2 x double> @powi_v2f64(<2 x double> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v2f64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vfmv.f.s fa0, v9
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fmv.d fs0, fa0
+; RV32-NEXT: fld fa0, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vfmv.v.f v8, fa0
+; RV32-NEXT: vfslide1down.vf v8, v8, fs0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v2f64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: fsd fs0, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: addi a1, sp, 32
+; RV64-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: sext.w s0, a0
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vfmv.f.s fa0, v9
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fmv.d fs0, fa0
+; RV64-NEXT: fld fa0, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vfmv.v.f v8, fa0
+; RV64-NEXT: vfslide1down.vf v8, v8, fs0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: fld fs0, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: ret
+ %a = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> %x, i32 %y)
+ ret <2 x double> %a
+}
+declare <2 x double> @llvm.powi.v2f64.i32(<2 x double>, i32)
+
+define <4 x double> @powi_v4f64(<4 x double> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v4f64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vfmv.f.s fa0, v10
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fmv.d fs0, fa0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vfmv.v.f v8, fa0
+; RV32-NEXT: vfslide1down.vf v8, v8, fs0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vslidedown.vi v8, v8, 2
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vfslide1down.vf v8, v8, fa0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v4f64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: fsd fs0, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 32
+; RV64-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: sext.w s0, a0
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 1
+; RV64-NEXT: vfmv.f.s fa0, v10
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fmv.d fs0, fa0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vfmv.v.f v8, fa0
+; RV64-NEXT: vfslide1down.vf v8, v8, fs0
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vfslide1down.vf v8, v8, fa0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: fld fs0, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: ret
+ %a = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> %x, i32 %y)
+ ret <4 x double> %a
+}
+declare <4 x double> @llvm.powi.v4f64.i32(<4 x double>, i32)
+
+define <8 x double> @powi_v8f64(<8 x double> %x, i32 %y) nounwind {
+; RV32-LABEL: powi_v8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -272
+; RV32-NEXT: sw ra, 268(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 264(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 260(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 272
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: mv s2, a0
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vs4r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vse64.v v8, (a0)
+; RV32-NEXT: fld fa0, 120(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fsd fa0, 184(sp)
+; RV32-NEXT: fld fa0, 112(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fsd fa0, 176(sp)
+; RV32-NEXT: fld fa0, 104(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fsd fa0, 168(sp)
+; RV32-NEXT: fld fa0, 96(sp)
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fsd fa0, 160(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fsd fa0, 128(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 1
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fsd fa0, 136(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fsd fa0, 152(sp)
+; RV32-NEXT: addi a0, sp, 256
+; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 2
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: call __powidf2
+; RV32-NEXT: fsd fa0, 144(sp)
+; RV32-NEXT: addi a0, sp, 128
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vle64.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -272
+; RV32-NEXT: lw ra, 268(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 264(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 260(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 272
+; RV32-NEXT: ret
+;
+; RV64-LABEL: powi_v8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -272
+; RV64-NEXT: sd ra, 264(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 256(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 248(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 272
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: addi a1, sp, 240
+; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: addi a1, sp, 64
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vse64.v v8, (a1)
+; RV64-NEXT: fld fa0, 120(sp)
+; RV64-NEXT: sext.w s2, a0
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fsd fa0, 184(sp)
+; RV64-NEXT: fld fa0, 112(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fsd fa0, 176(sp)
+; RV64-NEXT: fld fa0, 104(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fsd fa0, 168(sp)
+; RV64-NEXT: fld fa0, 96(sp)
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fsd fa0, 160(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fsd fa0, 128(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 1
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fsd fa0, 136(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fsd fa0, 152(sp)
+; RV64-NEXT: addi a0, sp, 240
+; RV64-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 2
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: call __powidf2
+; RV64-NEXT: fsd fa0, 144(sp)
+; RV64-NEXT: addi a0, sp, 128
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vle64.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -272
+; RV64-NEXT: ld ra, 264(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 256(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 248(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 272
+; RV64-NEXT: ret
+ %a = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> %x, i32 %y)
+ ret <8 x double> %a
+}
+declare <8 x double> @llvm.powi.v8f64.i32(<8 x double>, i32)