[Mlir-commits] [mlir] 322d0af - [llvm][mlir] Promote the experimental reduction intrinsics to be first class intrinsics.

Amara Emerson llvmlistbot@llvm.org
Wed Oct 7 10:39:04 PDT 2020


Author: Amara Emerson
Date: 2020-10-07T10:36:44-07:00
New Revision: 322d0afd875df66b36e4810a2b95c20a8f22ab9b

URL: https://github.com/llvm/llvm-project/commit/322d0afd875df66b36e4810a2b95c20a8f22ab9b
DIFF: https://github.com/llvm/llvm-project/commit/322d0afd875df66b36e4810a2b95c20a8f22ab9b.diff

LOG: [llvm][mlir] Promote the experimental reduction intrinsics to be first class intrinsics.

This change renames the intrinsics so that "experimental" no longer appears in their names.

The autoupgrader will handle legacy intrinsics.
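
For example (an illustrative sketch of the upgrade, mirroring the LangRef and
AutoUpgrade changes below), IR that uses a legacy name is rewritten in place to
the new name:

    ; legacy form, accepted and auto-upgraded:
    declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
    %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v)

    ; upgraded form:
    declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
    %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)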

Relevant ML thread: http://lists.llvm.org/pipermail/llvm-dev/2020-April/140729.html

Differential Revision: https://reviews.llvm.org/D88787

Added: 
    llvm/test/Instrumentation/MemorySanitizer/reduce.ll

Modified: 
    llvm/docs/LangRef.rst
    llvm/docs/ReleaseNotes.rst
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/include/llvm/IR/Intrinsics.td
    llvm/lib/Analysis/ConstantFolding.cpp
    llvm/lib/CodeGen/ExpandReductions.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/IR/AutoUpgrade.cpp
    llvm/lib/IR/IRBuilder.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
    llvm/lib/Target/ARM/ARMTargetTransformInfo.h
    llvm/lib/Target/ARM/MVETailPredication.cpp
    llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
    llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
    llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll
    llvm/test/Analysis/CostModel/ARM/reduce-add.ll
    llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
    llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
    llvm/test/Analysis/CostModel/ARM/reduce-umax.ll
    llvm/test/Analysis/CostModel/ARM/reduce-umin.ll
    llvm/test/Analysis/CostModel/X86/reduce-add.ll
    llvm/test/Analysis/CostModel/X86/reduce-and.ll
    llvm/test/Analysis/CostModel/X86/reduce-fmax.ll
    llvm/test/Analysis/CostModel/X86/reduce-fmin.ll
    llvm/test/Analysis/CostModel/X86/reduce-mul.ll
    llvm/test/Analysis/CostModel/X86/reduce-or.ll
    llvm/test/Analysis/CostModel/X86/reduce-smax.ll
    llvm/test/Analysis/CostModel/X86/reduce-smin.ll
    llvm/test/Analysis/CostModel/X86/reduce-umax.ll
    llvm/test/Analysis/CostModel/X86/reduce-umin.ll
    llvm/test/Analysis/CostModel/X86/reduce-xor.ll
    llvm/test/Assembler/invalid-vecreduce.ll
    llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll
    llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc
    llvm/test/CodeGen/AArch64/aarch64-addv.ll
    llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
    llvm/test/CodeGen/AArch64/arm64-vabs.ll
    llvm/test/CodeGen/AArch64/neon-dot-product.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
    llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll
    llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
    llvm/test/CodeGen/AArch64/vecreduce-bool.ll
    llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
    llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
    llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
    llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
    llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
    llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
    llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
    llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
    llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
    llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll
    llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
    llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
    llvm/test/CodeGen/Thumb2/mve-vaddv.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
    llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
    llvm/test/CodeGen/X86/haddsub.ll
    llvm/test/CodeGen/X86/pr45378.ll
    llvm/test/CodeGen/X86/vector-reduce-add.ll
    llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
    llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
    llvm/test/CodeGen/X86/vector-reduce-and.ll
    llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
    llvm/test/CodeGen/X86/vector-reduce-fadd.ll
    llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
    llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
    llvm/test/CodeGen/X86/vector-reduce-fmax.ll
    llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
    llvm/test/CodeGen/X86/vector-reduce-fmin.ll
    llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
    llvm/test/CodeGen/X86/vector-reduce-fmul.ll
    llvm/test/CodeGen/X86/vector-reduce-mul.ll
    llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
    llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
    llvm/test/CodeGen/X86/vector-reduce-or.ll
    llvm/test/CodeGen/X86/vector-reduce-smax.ll
    llvm/test/CodeGen/X86/vector-reduce-smin.ll
    llvm/test/CodeGen/X86/vector-reduce-umax.ll
    llvm/test/CodeGen/X86/vector-reduce-umin.ll
    llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
    llvm/test/CodeGen/X86/vector-reduce-xor.ll
    llvm/test/Transforms/InstCombine/vector-reductions.ll
    llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
    llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
    llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
    llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
    llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
    llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
    llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
    llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
    llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
    llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
    llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
    llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
    llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
    llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
    llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
    llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
    llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
    llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
    llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
    llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
    llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
    llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
    llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
    llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
    llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
    llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
    mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
    mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
    mlir/include/mlir/Dialect/Vector/VectorOps.td
    mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
    mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
    mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
    mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir
    mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
    mlir/test/Target/llvmir-intrinsics.mlir

Removed: 
    llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll


################################################################################
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index cd5c9e23b746..269c3675bf33 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15543,8 +15543,8 @@ should exit, this ``SUB`` is not allowed to wrap. The result is a condition
 that is used by the conditional branch controlling the loop.
 
 
-Experimental Vector Reduction Intrinsics
-----------------------------------------
+Vector Reduction Intrinsics
+---------------------------
 
 Horizontal reductions of vectors can be expressed using the following
 intrinsics. Each one takes a vector operand as an input and applies its
@@ -15552,21 +15552,21 @@ respective operation across all elements of the vector, returning a single
 scalar result of the same element type.
 
 
-'``llvm.experimental.vector.reduce.add.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a)
-      declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a)
+      declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+      declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD``
+The '``llvm.vector.reduce.add.*``' intrinsics do an integer ``ADD``
 reduction of a vector, returning the result as a scalar. The return type matches
 the element-type of the vector input.
 
@@ -15574,34 +15574,34 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.v2.fadd.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %start_value, <2 x double> %a)
+      declare float @llvm.vector.reduce.fadd.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.vector.reduce.fadd.v2f64(double %start_value, <2 x double> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.v2.fadd.*``' intrinsics do a floating-point
+The '``llvm.vector.reduce.fadd.*``' intrinsics do a floating-point
 ``ADD`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 
-If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
-reduction will not preserve the associativity of an equivalent scalarized
-counterpart. Otherwise the reduction will be *ordered*, thus implying that
-the operation respects the associativity of a scalarized reduction. That is, the
-reduction begins with the start value and performs an fadd operation with consecutively
-increasing vector element indices. See the following pseudocode:
+If the intrinsic call has the 'reassoc' flag set, then the reduction will not
+preserve the associativity of an equivalent scalarized counterpart. Otherwise
+the reduction will be *sequential*, thus implying that the operation respects
+the associativity of a scalarized reduction. That is, the reduction begins with
+the start value and performs an fadd operation with consecutively increasing
+vector element indices. See the following pseudocode:
 
 ::
 
-    float ordered_fadd(start_value, input_vector)
+    float sequential_fadd(start_value, input_vector)
       result = start_value
       for i = 0 to length(input_vector)
         result = result + input_vector[i]
@@ -15619,25 +15619,25 @@ Examples:
 
 ::
 
-      %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %input) ; unordered reduction
-      %ord = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction
+      %unord = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> %input) ; relaxed reduction
+      %ord = call float @llvm.vector.reduce.fadd.v4f32(float %start_value, <4 x float> %input) ; sequential reduction
 
 
-'``llvm.experimental.vector.reduce.mul.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.mul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %a)
-      declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %a)
+      declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
+      declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL``
+The '``llvm.vector.reduce.mul.*``' intrinsics do an integer ``MUL``
 reduction of a vector, returning the result as a scalar. The return type matches
 the element-type of the vector input.
 
@@ -15645,34 +15645,34 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.v2.fmul.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %start_value, <2 x double> %a)
+      declare float @llvm.vector.reduce.fmul.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.vector.reduce.fmul.v2f64(double %start_value, <2 x double> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.v2.fmul.*``' intrinsics do a floating-point
+The '``llvm.vector.reduce.fmul.*``' intrinsics do a floating-point
 ``MUL`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 
-If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
-reduction will not preserve the associativity of an equivalent scalarized
-counterpart. Otherwise the reduction will be *ordered*, thus implying that
-the operation respects the associativity of a scalarized reduction. That is, the
-reduction begins with the start value and performs an fmul operation with consecutively
-increasing vector element indices. See the following pseudocode:
+If the intrinsic call has the 'reassoc' flag set, then the reduction will not
+preserve the associativity of an equivalent scalarized counterpart. Otherwise
+the reduction will be *sequential*, thus implying that the operation respects
+the associativity of a scalarized reduction. That is, the reduction begins with
+the start value and performs an fmul operation with consecutively increasing
+vector element indices. See the following pseudocode:
 
 ::
 
-    float ordered_fmul(start_value, input_vector)
+    float sequential_fmul(start_value, input_vector)
       result = start_value
       for i = 0 to length(input_vector)
         result = result * input_vector[i]
@@ -15690,23 +15690,23 @@ Examples:
 
 ::
 
-      %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %input) ; unordered reduction
-      %ord = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction
+      %unord = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.0, <4 x float> %input) ; relaxed reduction
+      %ord = call float @llvm.vector.reduce.fmul.v4f32(float %start_value, <4 x float> %input) ; sequential reduction
 
-'``llvm.experimental.vector.reduce.and.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.and.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a)
+      declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND``
+The '``llvm.vector.reduce.and.*``' intrinsics do a bitwise ``AND``
 reduction of a vector, returning the result as a scalar. The return type matches
 the element-type of the vector input.
 
@@ -15714,20 +15714,20 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.or.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.or.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a)
+      declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction
+The '``llvm.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction
 of a vector, returning the result as a scalar. The return type matches the
 element-type of the vector input.
 
@@ -15735,20 +15735,20 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.xor.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.xor.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %a)
+      declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR``
+The '``llvm.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR``
 reduction of a vector, returning the result as a scalar. The return type matches
 the element-type of the vector input.
 
@@ -15756,20 +15756,20 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.smax.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.smax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a)
+      declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.smax.*``' intrinsics do a signed integer
+The '``llvm.vector.reduce.smax.*``' intrinsics do a signed integer
 ``MAX`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 
@@ -15777,20 +15777,20 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.smin.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.smin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a)
+      declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.smin.*``' intrinsics do a signed integer
+The '``llvm.vector.reduce.smin.*``' intrinsics do a signed integer
 ``MIN`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 
@@ -15798,20 +15798,20 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.umax.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.umax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a)
+      declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.umax.*``' intrinsics do an unsigned
+The '``llvm.vector.reduce.umax.*``' intrinsics do an unsigned
 integer ``MAX`` reduction of a vector, returning the result as a scalar. The
 return type matches the element-type of the vector input.
 
@@ -15819,20 +15819,20 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.umin.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.umin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a)
+      declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.umin.*``' intrinsics do an unsigned
+The '``llvm.vector.reduce.umin.*``' intrinsics do an unsigned
 integer ``MIN`` reduction of a vector, returning the result as a scalar. The
 return type matches the element-type of the vector input.
 
@@ -15840,21 +15840,21 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.fmax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a)
+      declare float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
+      declare double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics do a floating-point
+The '``llvm.vector.reduce.fmax.*``' intrinsics do a floating-point
 ``MAX`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 
@@ -15870,8 +15870,8 @@ Arguments:
 """"""""""
 The argument to this intrinsic must be a vector of floating-point values.
 
-'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.vector.reduce.fmin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
@@ -15879,13 +15879,13 @@ This is an overloaded intrinsic.
 
 ::
 
-      declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a)
+      declare float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
+      declare double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics do a floating-point
+The '``llvm.vector.reduce.fmin.*``' intrinsics do a floating-point
 ``MIN`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 47ce9fa10d90..b81547dca6df 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -63,6 +63,10 @@ Changes to the LLVM IR
 * Added the ``byref`` attribute to better represent argument passing
   for the `amdgpu_kernel` calling convention.
 
+* The ``llvm.experimental.vector.reduce`` family of intrinsics have been renamed
+  to drop the "experimental" from the name, reflecting their now fully supported
+  status in the IR.
+
 Changes to building LLVM
 ------------------------
 

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 2a1ec0103d09..f8a357a8479b 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1180,19 +1180,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                              VarMask, Alignment, CostKind, I);
     }
-    case Intrinsic::experimental_vector_reduce_add:
-    case Intrinsic::experimental_vector_reduce_mul:
-    case Intrinsic::experimental_vector_reduce_and:
-    case Intrinsic::experimental_vector_reduce_or:
-    case Intrinsic::experimental_vector_reduce_xor:
-    case Intrinsic::experimental_vector_reduce_v2_fadd:
-    case Intrinsic::experimental_vector_reduce_v2_fmul:
-    case Intrinsic::experimental_vector_reduce_smax:
-    case Intrinsic::experimental_vector_reduce_smin:
-    case Intrinsic::experimental_vector_reduce_fmax:
-    case Intrinsic::experimental_vector_reduce_fmin:
-    case Intrinsic::experimental_vector_reduce_umax:
-    case Intrinsic::experimental_vector_reduce_umin: {
+    case Intrinsic::vector_reduce_add:
+    case Intrinsic::vector_reduce_mul:
+    case Intrinsic::vector_reduce_and:
+    case Intrinsic::vector_reduce_or:
+    case Intrinsic::vector_reduce_xor:
+    case Intrinsic::vector_reduce_fadd:
+    case Intrinsic::vector_reduce_fmul:
+    case Intrinsic::vector_reduce_smax:
+    case Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_fmax:
+    case Intrinsic::vector_reduce_fmin:
+    case Intrinsic::vector_reduce_umax:
+    case Intrinsic::vector_reduce_umin: {
       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
       return getIntrinsicInstrCost(Attrs, CostKind);
     }
@@ -1407,46 +1407,46 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
                                             CostKind);
     }
-    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::vector_reduce_add:
       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
                                                  /*IsPairwiseForm=*/false,
                                                  CostKind);
-    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::vector_reduce_mul:
       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
                                                  /*IsPairwiseForm=*/false,
                                                  CostKind);
-    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::vector_reduce_and:
       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
                                                  /*IsPairwiseForm=*/false,
                                                  CostKind);
-    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::vector_reduce_or:
       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
                                                  /*IsPairwiseForm=*/false,
                                                  CostKind);
-    case Intrinsic::experimental_vector_reduce_xor:
+    case Intrinsic::vector_reduce_xor:
       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
                                                  /*IsPairwiseForm=*/false,
                                                  CostKind);
-    case Intrinsic::experimental_vector_reduce_v2_fadd:
+    case Intrinsic::vector_reduce_fadd:
       // FIXME: Add new flag for cost of strict reductions.
       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
                                                  /*IsPairwiseForm=*/false,
                                                  CostKind);
-    case Intrinsic::experimental_vector_reduce_v2_fmul:
+    case Intrinsic::vector_reduce_fmul:
       // FIXME: Add new flag for cost of strict reductions.
       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
                                                  /*IsPairwiseForm=*/false,
                                                  CostKind);
-    case Intrinsic::experimental_vector_reduce_smax:
-    case Intrinsic::experimental_vector_reduce_smin:
-    case Intrinsic::experimental_vector_reduce_fmax:
-    case Intrinsic::experimental_vector_reduce_fmin:
+    case Intrinsic::vector_reduce_smax:
+    case Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_fmax:
+    case Intrinsic::vector_reduce_fmin:
       return thisT()->getMinMaxReductionCost(
           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
           /*IsPairwiseForm=*/false,
           /*IsUnsigned=*/false, CostKind);
-    case Intrinsic::experimental_vector_reduce_umax:
-    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::vector_reduce_umax:
+    case Intrinsic::vector_reduce_umin:
       return thisT()->getMinMaxReductionCost(
           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
           /*IsPairwiseForm=*/false,

diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 20c6d3b8cb1c..d05b9982c487 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1452,34 +1452,35 @@ def int_memset_element_unordered_atomic
 //===------------------------ Reduction Intrinsics ------------------------===//
 //
 let IntrProperties = [IntrNoMem, IntrWillReturn] in {
-  def int_experimental_vector_reduce_v2_fadd : Intrinsic<[llvm_anyfloat_ty],
-                                                         [LLVMMatchType<0>,
-                                                          llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_v2_fmul : Intrinsic<[llvm_anyfloat_ty],
-                                                         [LLVMMatchType<0>,
-                                                          llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_add : Intrinsic<[LLVMVectorElementType<0>],
-                                                     [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_mul : Intrinsic<[LLVMVectorElementType<0>],
-                                                     [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_and : Intrinsic<[LLVMVectorElementType<0>],
-                                                     [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_or : Intrinsic<[LLVMVectorElementType<0>],
-                                                    [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_xor : Intrinsic<[LLVMVectorElementType<0>],
-                                                     [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_smax : Intrinsic<[LLVMVectorElementType<0>],
-                                                      [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_smin : Intrinsic<[LLVMVectorElementType<0>],
-                                                      [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_umax : Intrinsic<[LLVMVectorElementType<0>],
-                                                      [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_umin : Intrinsic<[LLVMVectorElementType<0>],
-                                                      [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_fmax : Intrinsic<[LLVMVectorElementType<0>],
-                                                      [llvm_anyvector_ty]>;
-  def int_experimental_vector_reduce_fmin : Intrinsic<[LLVMVectorElementType<0>],
-                                                      [llvm_anyvector_ty]>;
+
+  def int_vector_reduce_fadd : Intrinsic<[LLVMVectorElementType<0>],
+                                         [LLVMVectorElementType<0>,
+                                          llvm_anyvector_ty]>;
+  def int_vector_reduce_fmul : Intrinsic<[LLVMVectorElementType<0>],
+                                         [LLVMVectorElementType<0>,
+                                          llvm_anyvector_ty]>;
+  def int_vector_reduce_add : Intrinsic<[LLVMVectorElementType<0>],
+                                        [llvm_anyvector_ty]>;
+  def int_vector_reduce_mul : Intrinsic<[LLVMVectorElementType<0>],
+                                        [llvm_anyvector_ty]>;
+  def int_vector_reduce_and : Intrinsic<[LLVMVectorElementType<0>],
+                                        [llvm_anyvector_ty]>;
+  def int_vector_reduce_or : Intrinsic<[LLVMVectorElementType<0>],
+                                       [llvm_anyvector_ty]>;
+  def int_vector_reduce_xor : Intrinsic<[LLVMVectorElementType<0>],
+                                        [llvm_anyvector_ty]>;
+  def int_vector_reduce_smax : Intrinsic<[LLVMVectorElementType<0>],
+                                         [llvm_anyvector_ty]>;
+  def int_vector_reduce_smin : Intrinsic<[LLVMVectorElementType<0>],
+                                         [llvm_anyvector_ty]>;
+  def int_vector_reduce_umax : Intrinsic<[LLVMVectorElementType<0>],
+                                         [llvm_anyvector_ty]>;
+  def int_vector_reduce_umin : Intrinsic<[LLVMVectorElementType<0>],
+                                         [llvm_anyvector_ty]>;
+  def int_vector_reduce_fmax : Intrinsic<[LLVMVectorElementType<0>],
+                                         [llvm_anyvector_ty]>;
+  def int_vector_reduce_fmin : Intrinsic<[LLVMVectorElementType<0>],
+                                         [llvm_anyvector_ty]>;
 }
 
 //===----- Matrix intrinsics ---------------------------------------------===//

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 0ddca5a09838..9ae67d074c5c 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1457,15 +1457,15 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::smul_fix_sat:
   case Intrinsic::bitreverse:
   case Intrinsic::is_constant:
-  case Intrinsic::experimental_vector_reduce_add:
-  case Intrinsic::experimental_vector_reduce_mul:
-  case Intrinsic::experimental_vector_reduce_and:
-  case Intrinsic::experimental_vector_reduce_or:
-  case Intrinsic::experimental_vector_reduce_xor:
-  case Intrinsic::experimental_vector_reduce_smin:
-  case Intrinsic::experimental_vector_reduce_smax:
-  case Intrinsic::experimental_vector_reduce_umin:
-  case Intrinsic::experimental_vector_reduce_umax:
+  case Intrinsic::vector_reduce_add:
+  case Intrinsic::vector_reduce_mul:
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_xor:
+  case Intrinsic::vector_reduce_smin:
+  case Intrinsic::vector_reduce_smax:
+  case Intrinsic::vector_reduce_umin:
+  case Intrinsic::vector_reduce_umax:
   // Target intrinsics
   case Intrinsic::arm_mve_vctp8:
   case Intrinsic::arm_mve_vctp16:
@@ -1711,31 +1711,31 @@ Constant *ConstantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) {
       return nullptr;
     const APInt &X = CI->getValue();
     switch (IID) {
-    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::vector_reduce_add:
       Acc = Acc + X;
       break;
-    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::vector_reduce_mul:
       Acc = Acc * X;
       break;
-    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::vector_reduce_and:
       Acc = Acc & X;
       break;
-    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::vector_reduce_or:
       Acc = Acc | X;
       break;
-    case Intrinsic::experimental_vector_reduce_xor:
+    case Intrinsic::vector_reduce_xor:
       Acc = Acc ^ X;
       break;
-    case Intrinsic::experimental_vector_reduce_smin:
+    case Intrinsic::vector_reduce_smin:
       Acc = APIntOps::smin(Acc, X);
       break;
-    case Intrinsic::experimental_vector_reduce_smax:
+    case Intrinsic::vector_reduce_smax:
       Acc = APIntOps::smax(Acc, X);
       break;
-    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::vector_reduce_umin:
       Acc = APIntOps::umin(Acc, X);
       break;
-    case Intrinsic::experimental_vector_reduce_umax:
+    case Intrinsic::vector_reduce_umax:
       Acc = APIntOps::umax(Acc, X);
       break;
     }
@@ -2240,15 +2240,15 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
   if (isa<ConstantAggregateZero>(Operands[0])) {
     switch (IntrinsicID) {
     default: break;
-    case Intrinsic::experimental_vector_reduce_add:
-    case Intrinsic::experimental_vector_reduce_mul:
-    case Intrinsic::experimental_vector_reduce_and:
-    case Intrinsic::experimental_vector_reduce_or:
-    case Intrinsic::experimental_vector_reduce_xor:
-    case Intrinsic::experimental_vector_reduce_smin:
-    case Intrinsic::experimental_vector_reduce_smax:
-    case Intrinsic::experimental_vector_reduce_umin:
-    case Intrinsic::experimental_vector_reduce_umax:
+    case Intrinsic::vector_reduce_add:
+    case Intrinsic::vector_reduce_mul:
+    case Intrinsic::vector_reduce_and:
+    case Intrinsic::vector_reduce_or:
+    case Intrinsic::vector_reduce_xor:
+    case Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_smax:
+    case Intrinsic::vector_reduce_umin:
+    case Intrinsic::vector_reduce_umax:
       return ConstantInt::get(Ty, 0);
     }
   }
@@ -2259,15 +2259,15 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
     auto *Op = cast<Constant>(Operands[0]);
     switch (IntrinsicID) {
     default: break;
-    case Intrinsic::experimental_vector_reduce_add:
-    case Intrinsic::experimental_vector_reduce_mul:
-    case Intrinsic::experimental_vector_reduce_and:
-    case Intrinsic::experimental_vector_reduce_or:
-    case Intrinsic::experimental_vector_reduce_xor:
-    case Intrinsic::experimental_vector_reduce_smin:
-    case Intrinsic::experimental_vector_reduce_smax:
-    case Intrinsic::experimental_vector_reduce_umin:
-    case Intrinsic::experimental_vector_reduce_umax:
+    case Intrinsic::vector_reduce_add:
+    case Intrinsic::vector_reduce_mul:
+    case Intrinsic::vector_reduce_and:
+    case Intrinsic::vector_reduce_or:
+    case Intrinsic::vector_reduce_xor:
+    case Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_smax:
+    case Intrinsic::vector_reduce_umin:
+    case Intrinsic::vector_reduce_umax:
       if (Constant *C = ConstantFoldVectorReduce(IntrinsicID, Op))
         return C;
       break;

diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
index dfaaafaf811f..184eae51b2ce 100644
--- a/llvm/lib/CodeGen/ExpandReductions.cpp
+++ b/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This pass implements IR expansion for reduction intrinsics, allowing targets
-// to enable the experimental intrinsics until just before codegen.
+// to enable the intrinsics until just before codegen.
 //
 //===----------------------------------------------------------------------===//
 
@@ -30,27 +30,27 @@ namespace {
 
 unsigned getOpcode(Intrinsic::ID ID) {
   switch (ID) {
-  case Intrinsic::experimental_vector_reduce_v2_fadd:
+  case Intrinsic::vector_reduce_fadd:
     return Instruction::FAdd;
-  case Intrinsic::experimental_vector_reduce_v2_fmul:
+  case Intrinsic::vector_reduce_fmul:
     return Instruction::FMul;
-  case Intrinsic::experimental_vector_reduce_add:
+  case Intrinsic::vector_reduce_add:
     return Instruction::Add;
-  case Intrinsic::experimental_vector_reduce_mul:
+  case Intrinsic::vector_reduce_mul:
     return Instruction::Mul;
-  case Intrinsic::experimental_vector_reduce_and:
+  case Intrinsic::vector_reduce_and:
     return Instruction::And;
-  case Intrinsic::experimental_vector_reduce_or:
+  case Intrinsic::vector_reduce_or:
     return Instruction::Or;
-  case Intrinsic::experimental_vector_reduce_xor:
+  case Intrinsic::vector_reduce_xor:
     return Instruction::Xor;
-  case Intrinsic::experimental_vector_reduce_smax:
-  case Intrinsic::experimental_vector_reduce_smin:
-  case Intrinsic::experimental_vector_reduce_umax:
-  case Intrinsic::experimental_vector_reduce_umin:
+  case Intrinsic::vector_reduce_smax:
+  case Intrinsic::vector_reduce_smin:
+  case Intrinsic::vector_reduce_umax:
+  case Intrinsic::vector_reduce_umin:
     return Instruction::ICmp;
-  case Intrinsic::experimental_vector_reduce_fmax:
-  case Intrinsic::experimental_vector_reduce_fmin:
+  case Intrinsic::vector_reduce_fmax:
+  case Intrinsic::vector_reduce_fmin:
     return Instruction::FCmp;
   default:
     llvm_unreachable("Unexpected ID");
@@ -59,17 +59,17 @@ unsigned getOpcode(Intrinsic::ID ID) {
 
 RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {
   switch (ID) {
-  case Intrinsic::experimental_vector_reduce_smax:
+  case Intrinsic::vector_reduce_smax:
     return RecurrenceDescriptor::MRK_SIntMax;
-  case Intrinsic::experimental_vector_reduce_smin:
+  case Intrinsic::vector_reduce_smin:
     return RecurrenceDescriptor::MRK_SIntMin;
-  case Intrinsic::experimental_vector_reduce_umax:
+  case Intrinsic::vector_reduce_umax:
     return RecurrenceDescriptor::MRK_UIntMax;
-  case Intrinsic::experimental_vector_reduce_umin:
+  case Intrinsic::vector_reduce_umin:
     return RecurrenceDescriptor::MRK_UIntMin;
-  case Intrinsic::experimental_vector_reduce_fmax:
+  case Intrinsic::vector_reduce_fmax:
     return RecurrenceDescriptor::MRK_FloatMax;
-  case Intrinsic::experimental_vector_reduce_fmin:
+  case Intrinsic::vector_reduce_fmin:
     return RecurrenceDescriptor::MRK_FloatMin;
   default:
     return RecurrenceDescriptor::MRK_Invalid;
@@ -83,19 +83,19 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
       switch (II->getIntrinsicID()) {
       default: break;
-      case Intrinsic::experimental_vector_reduce_v2_fadd:
-      case Intrinsic::experimental_vector_reduce_v2_fmul:
-      case Intrinsic::experimental_vector_reduce_add:
-      case Intrinsic::experimental_vector_reduce_mul:
-      case Intrinsic::experimental_vector_reduce_and:
-      case Intrinsic::experimental_vector_reduce_or:
-      case Intrinsic::experimental_vector_reduce_xor:
-      case Intrinsic::experimental_vector_reduce_smax:
-      case Intrinsic::experimental_vector_reduce_smin:
-      case Intrinsic::experimental_vector_reduce_umax:
-      case Intrinsic::experimental_vector_reduce_umin:
-      case Intrinsic::experimental_vector_reduce_fmax:
-      case Intrinsic::experimental_vector_reduce_fmin:
+      case Intrinsic::vector_reduce_fadd:
+      case Intrinsic::vector_reduce_fmul:
+      case Intrinsic::vector_reduce_add:
+      case Intrinsic::vector_reduce_mul:
+      case Intrinsic::vector_reduce_and:
+      case Intrinsic::vector_reduce_or:
+      case Intrinsic::vector_reduce_xor:
+      case Intrinsic::vector_reduce_smax:
+      case Intrinsic::vector_reduce_smin:
+      case Intrinsic::vector_reduce_umax:
+      case Intrinsic::vector_reduce_umin:
+      case Intrinsic::vector_reduce_fmax:
+      case Intrinsic::vector_reduce_fmin:
         if (TTI->shouldExpandReduction(II))
           Worklist.push_back(II);
 
@@ -116,8 +116,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     Builder.setFastMathFlags(FMF);
     switch (ID) {
     default: llvm_unreachable("Unexpected intrinsic!");
-    case Intrinsic::experimental_vector_reduce_v2_fadd:
-    case Intrinsic::experimental_vector_reduce_v2_fmul: {
+    case Intrinsic::vector_reduce_fadd:
+    case Intrinsic::vector_reduce_fmul: {
       // FMFs must be attached to the call, otherwise it's an ordered reduction
       // and it can't be handled by generating a shuffle sequence.
       Value *Acc = II->getArgOperand(0);
@@ -135,15 +135,15 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
       }
       break;
     }
-    case Intrinsic::experimental_vector_reduce_add:
-    case Intrinsic::experimental_vector_reduce_mul:
-    case Intrinsic::experimental_vector_reduce_and:
-    case Intrinsic::experimental_vector_reduce_or:
-    case Intrinsic::experimental_vector_reduce_xor:
-    case Intrinsic::experimental_vector_reduce_smax:
-    case Intrinsic::experimental_vector_reduce_smin:
-    case Intrinsic::experimental_vector_reduce_umax:
-    case Intrinsic::experimental_vector_reduce_umin: {
+    case Intrinsic::vector_reduce_add:
+    case Intrinsic::vector_reduce_mul:
+    case Intrinsic::vector_reduce_and:
+    case Intrinsic::vector_reduce_or:
+    case Intrinsic::vector_reduce_xor:
+    case Intrinsic::vector_reduce_smax:
+    case Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_umax:
+    case Intrinsic::vector_reduce_umin: {
       Value *Vec = II->getArgOperand(0);
       if (!isPowerOf2_32(
               cast<FixedVectorType>(Vec->getType())->getNumElements()))
@@ -152,8 +152,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
       Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
       break;
     }
-    case Intrinsic::experimental_vector_reduce_fmax:
-    case Intrinsic::experimental_vector_reduce_fmin: {
+    case Intrinsic::vector_reduce_fmax:
+    case Intrinsic::vector_reduce_fmin: {
       // FIXME: We only expand 'fast' reductions here because the underlying
       //        code in createMinMaxOp() assumes that comparisons use 'fast'
       //        semantics.

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index a166b790bca6..2f3601137f83 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6762,19 +6762,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     LowerDeoptimizeCall(&I);
     return;
 
-  case Intrinsic::experimental_vector_reduce_v2_fadd:
-  case Intrinsic::experimental_vector_reduce_v2_fmul:
-  case Intrinsic::experimental_vector_reduce_add:
-  case Intrinsic::experimental_vector_reduce_mul:
-  case Intrinsic::experimental_vector_reduce_and:
-  case Intrinsic::experimental_vector_reduce_or:
-  case Intrinsic::experimental_vector_reduce_xor:
-  case Intrinsic::experimental_vector_reduce_smax:
-  case Intrinsic::experimental_vector_reduce_smin:
-  case Intrinsic::experimental_vector_reduce_umax:
-  case Intrinsic::experimental_vector_reduce_umin:
-  case Intrinsic::experimental_vector_reduce_fmax:
-  case Intrinsic::experimental_vector_reduce_fmin:
+  case Intrinsic::vector_reduce_fadd:
+  case Intrinsic::vector_reduce_fmul:
+  case Intrinsic::vector_reduce_add:
+  case Intrinsic::vector_reduce_mul:
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_xor:
+  case Intrinsic::vector_reduce_smax:
+  case Intrinsic::vector_reduce_smin:
+  case Intrinsic::vector_reduce_umax:
+  case Intrinsic::vector_reduce_umin:
+  case Intrinsic::vector_reduce_fmax:
+  case Intrinsic::vector_reduce_fmin:
     visitVectorReduce(I, Intrinsic);
     return;
 
@@ -8937,7 +8937,7 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
     SDFlags.copyFMF(*FPMO);
 
   switch (Intrinsic) {
-  case Intrinsic::experimental_vector_reduce_v2_fadd:
+  case Intrinsic::vector_reduce_fadd:
     if (SDFlags.hasAllowReassociation())
       Res = DAG.getNode(ISD::FADD, dl, VT, Op1,
                         DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2, SDFlags),
@@ -8945,7 +8945,7 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
     else
       Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2, SDFlags);
     break;
-  case Intrinsic::experimental_vector_reduce_v2_fmul:
+  case Intrinsic::vector_reduce_fmul:
     if (SDFlags.hasAllowReassociation())
       Res = DAG.getNode(ISD::FMUL, dl, VT, Op1,
                         DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2, SDFlags),
@@ -8953,37 +8953,37 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
     else
       Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2, SDFlags);
     break;
-  case Intrinsic::experimental_vector_reduce_add:
+  case Intrinsic::vector_reduce_add:
     Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
     break;
-  case Intrinsic::experimental_vector_reduce_mul:
+  case Intrinsic::vector_reduce_mul:
     Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
     break;
-  case Intrinsic::experimental_vector_reduce_and:
+  case Intrinsic::vector_reduce_and:
     Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
     break;
-  case Intrinsic::experimental_vector_reduce_or:
+  case Intrinsic::vector_reduce_or:
     Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
     break;
-  case Intrinsic::experimental_vector_reduce_xor:
+  case Intrinsic::vector_reduce_xor:
     Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
     break;
-  case Intrinsic::experimental_vector_reduce_smax:
+  case Intrinsic::vector_reduce_smax:
     Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
     break;
-  case Intrinsic::experimental_vector_reduce_smin:
+  case Intrinsic::vector_reduce_smin:
     Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
     break;
-  case Intrinsic::experimental_vector_reduce_umax:
+  case Intrinsic::vector_reduce_umax:
     Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
     break;
-  case Intrinsic::experimental_vector_reduce_umin:
+  case Intrinsic::vector_reduce_umin:
     Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
     break;
-  case Intrinsic::experimental_vector_reduce_fmax:
+  case Intrinsic::vector_reduce_fmax:
     Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
     break;
-  case Intrinsic::experimental_vector_reduce_fmin:
+  case Intrinsic::vector_reduce_fmin:
     Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
     break;
   default:

diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index d27c1b459149..f5b235a57861 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/IntrinsicsX86.h"
@@ -717,18 +718,42 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
   }
   case 'e': {
     SmallVector<StringRef, 2> Groups;
-    static const Regex R("^experimental.vector.reduce.([a-z]+)\\.[fi][0-9]+");
+    static const Regex R("^experimental.vector.reduce.([a-z]+)\\.[a-z][0-9]+");
     if (R.match(Name, &Groups)) {
+      Intrinsic::ID ID;
+      ID = StringSwitch<Intrinsic::ID>(Groups[1])
+               .Case("add", Intrinsic::vector_reduce_add)
+               .Case("mul", Intrinsic::vector_reduce_mul)
+               .Case("and", Intrinsic::vector_reduce_and)
+               .Case("or", Intrinsic::vector_reduce_or)
+               .Case("xor", Intrinsic::vector_reduce_xor)
+               .Case("smax", Intrinsic::vector_reduce_smax)
+               .Case("smin", Intrinsic::vector_reduce_smin)
+               .Case("umax", Intrinsic::vector_reduce_umax)
+               .Case("umin", Intrinsic::vector_reduce_umin)
+               .Case("fmax", Intrinsic::vector_reduce_fmax)
+               .Case("fmin", Intrinsic::vector_reduce_fmin)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic) {
+        rename(F);
+        auto Args = F->getFunctionType()->params();
+        NewFn = Intrinsic::getDeclaration(F->getParent(), ID, {Args[0]});
+        return true;
+      }
+    }
+    static const Regex R2(
+        "^experimental.vector.reduce.v2.([a-z]+)\\.[fi][0-9]+");
+    Groups.clear();
+    if (R2.match(Name, &Groups)) {
       Intrinsic::ID ID = Intrinsic::not_intrinsic;
       if (Groups[1] == "fadd")
-        ID = Intrinsic::experimental_vector_reduce_v2_fadd;
+        ID = Intrinsic::vector_reduce_fadd;
       if (Groups[1] == "fmul")
-        ID = Intrinsic::experimental_vector_reduce_v2_fmul;
-
+        ID = Intrinsic::vector_reduce_fmul;
       if (ID != Intrinsic::not_intrinsic) {
         rename(F);
         auto Args = F->getFunctionType()->params();
-        Type *Tys[] = {F->getFunctionType()->getReturnType(), Args[1]};
+        Type *Tys[] = {Args[1]};
         NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
         return true;
       }
@@ -3620,28 +3645,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     DefaultCase();
     return;
   }
-  case Intrinsic::experimental_vector_reduce_v2_fmul: {
-    SmallVector<Value *, 2> Args;
-    if (CI->isFast())
-      Args.push_back(ConstantFP::get(CI->getOperand(0)->getType(), 1.0));
-    else
-      Args.push_back(CI->getOperand(0));
-    Args.push_back(CI->getOperand(1));
-    NewCall = Builder.CreateCall(NewFn, Args);
-    cast<Instruction>(NewCall)->copyFastMathFlags(CI);
-    break;
-  }
-  case Intrinsic::experimental_vector_reduce_v2_fadd: {
-    SmallVector<Value *, 2> Args;
-    if (CI->isFast())
-      Args.push_back(Constant::getNullValue(CI->getOperand(0)->getType()));
-    else
-      Args.push_back(CI->getOperand(0));
-    Args.push_back(CI->getOperand(1));
-    NewCall = Builder.CreateCall(NewFn, Args);
-    cast<Instruction>(NewCall)->copyFastMathFlags(CI);
-    break;
-  }
   case Intrinsic::arm_neon_vld1:
   case Intrinsic::arm_neon_vld2:
   case Intrinsic::arm_neon_vld3:
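
As a rough illustration of what the upgrader now does (a hand-written sketch, not one of the commit's test cases; the function name is invented), old bitcode such as

  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
  declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)

  define float @upgrade_sketch(<4 x i32> %iv, <4 x float> %fv) {
    %s = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %iv)
    %acc = sitofp i32 %s to float
    %f = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %fv)
    ret float %f
  }

should be rewritten in place to the promoted names, with the operands and fast-math flags left untouched:

    %s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %iv)
    %f = call fast float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %fv)

Note that the new fadd/fmul names are overloaded only on the vector type, which is why the declaration is now built from {Args[1]} alone.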

diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index febfe189df6e..3319f48b42d7 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -325,61 +325,53 @@ static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID,
 CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
   Module *M = GetInsertBlock()->getParent()->getParent();
   Value *Ops[] = {Acc, Src};
-  Type *Tys[] = {Acc->getType(), Src->getType()};
-  auto Decl = Intrinsic::getDeclaration(
-      M, Intrinsic::experimental_vector_reduce_v2_fadd, Tys);
+  auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fadd,
+                                        {Src->getType()});
   return createCallHelper(Decl, Ops, this);
 }
 
 CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
   Module *M = GetInsertBlock()->getParent()->getParent();
   Value *Ops[] = {Acc, Src};
-  Type *Tys[] = {Acc->getType(), Src->getType()};
-  auto Decl = Intrinsic::getDeclaration(
-      M, Intrinsic::experimental_vector_reduce_v2_fmul, Tys);
+  auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fmul,
+                                        {Src->getType()});
   return createCallHelper(Decl, Ops, this);
 }
 
 CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
-  return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add,
-                               Src);
+  return getReductionIntrinsic(this, Intrinsic::vector_reduce_add, Src);
 }
 
 CallInst *IRBuilderBase::CreateMulReduce(Value *Src) {
-  return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul,
-                               Src);
+  return getReductionIntrinsic(this, Intrinsic::vector_reduce_mul, Src);
 }
 
 CallInst *IRBuilderBase::CreateAndReduce(Value *Src) {
-  return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and,
-                               Src);
+  return getReductionIntrinsic(this, Intrinsic::vector_reduce_and, Src);
 }
 
 CallInst *IRBuilderBase::CreateOrReduce(Value *Src) {
-  return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or,
-                               Src);
+  return getReductionIntrinsic(this, Intrinsic::vector_reduce_or, Src);
 }
 
 CallInst *IRBuilderBase::CreateXorReduce(Value *Src) {
-  return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor,
-                               Src);
+  return getReductionIntrinsic(this, Intrinsic::vector_reduce_xor, Src);
 }
 
 CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) {
-  auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax
-                     : Intrinsic::experimental_vector_reduce_umax;
+  auto ID =
+      IsSigned ? Intrinsic::vector_reduce_smax : Intrinsic::vector_reduce_umax;
   return getReductionIntrinsic(this, ID, Src);
 }
 
 CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) {
-  auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smin
-                     : Intrinsic::experimental_vector_reduce_umin;
+  auto ID =
+      IsSigned ? Intrinsic::vector_reduce_smin : Intrinsic::vector_reduce_umin;
   return getReductionIntrinsic(this, ID, Src);
 }
 
 CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) {
-  auto Rdx = getReductionIntrinsic(
-      this, Intrinsic::experimental_vector_reduce_fmax, Src);
+  auto Rdx = getReductionIntrinsic(this, Intrinsic::vector_reduce_fmax, Src);
   if (NoNaN) {
     FastMathFlags FMF;
     FMF.setNoNaNs();
@@ -389,8 +381,7 @@ CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) {
 }
 
 CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) {
-  auto Rdx = getReductionIntrinsic(
-      this, Intrinsic::experimental_vector_reduce_fmin, Src);
+  auto Rdx = getReductionIntrinsic(this, Intrinsic::vector_reduce_fmin, Src);
   if (NoNaN) {
     FastMathFlags FMF;
     FMF.setNoNaNs();
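
For reference, a minimal sketch of the IR these helpers are now expected to emit (illustrative only, not taken from the patch; the function name is invented):

  declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
  declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)

  define float @builder_sketch(float %acc, <4 x float> %v) {
    ; CreateFAddReduce(Acc, Src): explicit start value, ordered unless fast-math flags say otherwise
    %sum = call float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %v)
    ; CreateFPMaxReduce(Src, /*NoNaN=*/true): the renamed intrinsic tagged with nnan
    %max = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v)
    %r = fadd float %sum, %max
    ret float %r
  }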

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index be27849bc19f..c585d13d1135 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -219,8 +219,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
 
   bool shouldExpandReduction(const IntrinsicInst *II) const {
     switch (II->getIntrinsicID()) {
-    case Intrinsic::experimental_vector_reduce_v2_fadd:
-    case Intrinsic::experimental_vector_reduce_v2_fmul:
+    case Intrinsic::vector_reduce_fadd:
+    case Intrinsic::vector_reduce_fmul:
       // We don't have legalization support for ordered FP reductions.
       return !II->getFastMathFlags().allowReassoc();
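
In IR terms the distinction looks roughly like this (illustrative only; the function name is invented). The first call below is an ordered reduction, so shouldExpandReduction() returns true and it is expanded to a scalar sequence before ISel; the reassoc form may be lowered as a tree reduction, so the intrinsic is kept for the backend:

  declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)

  define float @ordered_vs_relaxed(float %acc, <4 x float> %v) {
    ; no reassoc: lanes must be accumulated in order
    %strict = call float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %v)
    ; reassoc: reassociation is allowed, keep the intrinsic for ISel
    %relaxed = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %strict, <4 x float> %v)
    ret float %relaxed
  }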
 

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 4189eba111e3..689b484ad976 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -195,8 +195,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
 
   bool shouldExpandReduction(const IntrinsicInst *II) const {
     switch (II->getIntrinsicID()) {
-    case Intrinsic::experimental_vector_reduce_v2_fadd:
-    case Intrinsic::experimental_vector_reduce_v2_fmul:
+    case Intrinsic::vector_reduce_fadd:
+    case Intrinsic::vector_reduce_fmul:
       // We don't have legalization support for ordered FP reductions.
       return !II->getFastMathFlags().allowReassoc();
     default:

diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index f01f485859b0..a721a1670582 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -270,7 +270,7 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
       case Intrinsic::uadd_sat:
       case Intrinsic::ssub_sat:
       case Intrinsic::usub_sat:
-      case Intrinsic::experimental_vector_reduce_add:
+      case Intrinsic::vector_reduce_add:
         continue;
       case Intrinsic::fma:
       case Intrinsic::trunc:

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 15bdbd01e71c..00e981686e79 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1824,8 +1824,7 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
   }
 
   auto m_AddRdx = [](Value *&Vec) {
-    return m_OneUse(
-        m_Intrinsic<Intrinsic::experimental_vector_reduce_add>(m_Value(Vec)));
+    return m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_add>(m_Value(Vec)));
   };
   Value *V0, *V1;
   if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) &&
@@ -1833,8 +1832,8 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
     // Difference of sums is sum of differences:
     // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1)
     Value *Sub = Builder.CreateSub(V0, V1);
-    Value *Rdx = Builder.CreateIntrinsic(
-        Intrinsic::experimental_vector_reduce_add, {Sub->getType()}, {Sub});
+    Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_add,
+                                         {Sub->getType()}, {Sub});
     return replaceInstUsesWith(I, Rdx);
   }
 
@@ -2280,9 +2279,8 @@ Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) {
     }
 
     auto m_FaddRdx = [](Value *&Sum, Value *&Vec) {
-      return m_OneUse(
-          m_Intrinsic<Intrinsic::experimental_vector_reduce_v2_fadd>(
-              m_Value(Sum), m_Value(Vec)));
+      return m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_fadd>(m_Value(Sum),
+                                                                 m_Value(Vec)));
     };
     Value *A0, *A1, *V0, *V1;
     if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) &&
@@ -2290,9 +2288,8 @@ Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) {
      // Difference of sums is sum of differences:
       // add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1
       Value *Sub = Builder.CreateFSubFMF(V0, V1, &I);
-      Value *Rdx = Builder.CreateIntrinsic(
-          Intrinsic::experimental_vector_reduce_v2_fadd,
-          {A0->getType(), Sub->getType()}, {A0, Sub}, &I);
+      Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_fadd,
+                                           {Sub->getType()}, {A0, Sub}, &I);
       return BinaryOperator::CreateFSubFMF(Rdx, A1, &I);
     }
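
A hand-written sketch of the integer form of this fold (not one of the commit's tests; the FP variant with an explicit start value is folded analogously, as in the hunk just above):

  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

  define i32 @rdx_sub_sketch(<4 x i32> %v0, <4 x i32> %v1) {
    %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)   ; single use
    %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)   ; single use
    %d = sub i32 %r0, %r1
    ret i32 %d
  }

  ; instcombine should rewrite the body to:
  ;   %s = sub <4 x i32> %v0, %v1
  ;   %d = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)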
 

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index df2144d206d5..f5dde7424482 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2995,7 +2995,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOrigin(&I, getOrigin(&I, 0));
   }
 
-  // Instrument experimental.vector.reduce.or intrinsic.
+  // Instrument vector.reduce.or intrinsic.
   // Valid (non-poisoned) set bits in the operand pull low the
   // corresponding shadow bits.
   void handleVectorReduceOrIntrinsic(IntrinsicInst &I) {
@@ -3013,7 +3013,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOrigin(&I, getOrigin(&I, 0));
   }
 
-  // Instrument experimental.vector.reduce.or intrinsic.
+  // Instrument vector.reduce.and intrinsic.
   // Valid (non-poisoned) unset bits in the operand pull down the
   // corresponding shadow bits.
   void handleVectorReduceAndIntrinsic(IntrinsicInst &I) {
@@ -3264,15 +3264,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     case Intrinsic::masked_load:
       handleMaskedLoad(I);
       break;
-    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::vector_reduce_and:
       handleVectorReduceAndIntrinsic(I);
       break;
-    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::vector_reduce_or:
       handleVectorReduceOrIntrinsic(I);
       break;
-    case Intrinsic::experimental_vector_reduce_add:
-    case Intrinsic::experimental_vector_reduce_xor:
-    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::vector_reduce_add:
+    case Intrinsic::vector_reduce_xor:
+    case Intrinsic::vector_reduce_mul:
       handleVectorReduceIntrinsic(I);
       break;
     case Intrinsic::x86_sse_stmxcsr:
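
A back-of-the-envelope illustration of the pull-low rule described in the reduce.or comment above (values invented): a clean set bit in any lane makes the corresponding result bit known, so it stays unpoisoned even when other lanes are poisoned at that position.

  ; For  %r = call i4 @llvm.vector.reduce.or.v2i4(<2 x i4> %v)  with:
  ;   lane values  = < 0b1010, 0b0001 >
  ;   lane shadows = < 0b0100, 0b0011 >   ; a 1 marks a poisoned bit
  ;   a plain OR of the shadows would give 0b0111, but lane 0 carries a clean 1
  ;   at bit 1, so that bit of the reduction is known to be 1; the result shadow
  ;   is 0b0101 (bits 0 and 2 stay poisoned, bits 1 and 3 are clean)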

diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll
index d42d740dabff..3ddf90f12c2e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll
@@ -2,278 +2,278 @@
 ; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE
 
 ; COST-LABEL: add.i8.v8i8
-; COST:       Found an estimated cost of 1 for instruction: %r = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
 ; CODE-LABEL: add.i8.v8i8
 ; CODE:       addv b0, v0.8b
 define i8 @add.i8.v8i8(<8 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: add.i8.v16i8
-; COST:       Found an estimated cost of 1 for instruction: %r = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
 ; CODE-LABEL: add.i8.v16i8
 ; CODE:       addv b0, v0.16b
 define i8 @add.i8.v16i8(<16 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: add.i16.v4i16
-; COST:       Found an estimated cost of 1 for instruction: %r = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
 ; CODE-LABEL: add.i16.v4i16
 ; CODE:       addv h0, v0.4h
 define i16 @add.i16.v4i16(<4 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: add.i16.v8i16
-; COST:       Found an estimated cost of 1 for instruction: %r = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
 ; CODE-LABEL: add.i16.v8i16
 ; CODE:       addv h0, v0.8h
 define i16 @add.i16.v8i16(<8 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: add.i32.v4i32
-; COST:       Found an estimated cost of 1 for instruction: %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 1 for instruction: %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
 ; CODE-LABEL: add.i32.v4i32
 ; CODE:       addv s0, v0.4s
 define i32 @add.i32.v4i32(<4 x i32> %v) {
-  %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v)
+  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
   ret i32 %r
 }
 
 ; COST-LABEL: umin.i8.v8i8
-; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v)
 ; CODE-LABEL: umin.i8.v8i8
 ; CODE:       uminv b0, v0.8b
 define i8 @umin.i8.v8i8(<8 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: umin.i8.v16i8
-; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v)
 ; CODE-LABEL: umin.i8.v16i8
 ; CODE:       uminv b0, v0.16b
 define i8 @umin.i8.v16i8(<16 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: umin.i16.v4i16
-; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v)
 ; CODE-LABEL: umin.i16.v4i16
 ; CODE:       uminv h0, v0.4h
 define i16 @umin.i16.v4i16(<4 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: umin.i16.v8i16
-; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v)
 ; CODE-LABEL: umin.i16.v8i16
 ; CODE:       uminv h0, v0.8h
 define i16 @umin.i16.v8i16(<8 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: umin.i32.v4i32
-; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v)
 ; CODE-LABEL: umin.i32.v4i32
 ; CODE:       uminv s0, v0.4s
 define i32 @umin.i32.v4i32(<4 x i32> %v) {
-  %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %v)
+  %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v)
   ret i32 %r
 }
 
 ; COST-LABEL: umax.i8.v8i8
-; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
 ; CODE-LABEL: umax.i8.v8i8
 ; CODE:       umaxv b0, v0.8b
 define i8 @umax.i8.v8i8(<8 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: umax.i8.v16i8
-; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
 ; CODE-LABEL: umax.i8.v16i8
 ; CODE:       umaxv b0, v0.16b
 define i8 @umax.i8.v16i8(<16 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: umax.i16.v4i16
-; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
 ; CODE-LABEL: umax.i16.v4i16
 ; CODE:       umaxv h0, v0.4h
 define i16 @umax.i16.v4i16(<4 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: umax.i16.v8i16
-; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
 ; CODE-LABEL: umax.i16.v8i16
 ; CODE:       umaxv h0, v0.8h
 define i16 @umax.i16.v8i16(<8 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: umax.i32.v4i32
-; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)
 ; CODE-LABEL: umax.i32.v4i32
 ; CODE:       umaxv s0, v0.4s
 define i32 @umax.i32.v4i32(<4 x i32> %v) {
-  %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %v)
+  %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)
   ret i32 %r
 }
 
 ; COST-LABEL: smin.i8.v8i8
-; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
 ; CODE-LABEL: smin.i8.v8i8
 ; CODE:       sminv b0, v0.8b
 define i8 @smin.i8.v8i8(<8 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: smin.i8.v16i8
-; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
 ; CODE-LABEL: smin.i8.v16i8
 ; CODE:       sminv b0, v0.16b
 define i8 @smin.i8.v16i8(<16 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: smin.i16.v4i16
-; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
 ; CODE-LABEL: smin.i16.v4i16
 ; CODE:       sminv h0, v0.4h
 define i16 @smin.i16.v4i16(<4 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: smin.i16.v8i16
-; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
 ; CODE-LABEL: smin.i16.v8i16
 ; CODE:       sminv h0, v0.8h
 define i16 @smin.i16.v8i16(<8 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: smin.i32.v4i32
-; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v)
 ; CODE-LABEL: smin.i32.v4i32
 ; CODE:       sminv s0, v0.4s
 define i32 @smin.i32.v4i32(<4 x i32> %v) {
-  %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %v)
+  %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v)
   ret i32 %r
 }
 
 ; COST-LABEL: smax.i8.v8i8
-; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
 ; CODE-LABEL: smax.i8.v8i8
 ; CODE:       smaxv b0, v0.8b
 define i8 @smax.i8.v8i8(<8 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: smax.i8.v16i8
-; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
 ; CODE-LABEL: smax.i8.v16i8
 ; CODE:       smaxv b0, v0.16b
 define i8 @smax.i8.v16i8(<16 x i8> %v) {
-  %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %v)
+  %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
   ret i8 %r
 }
 
 ; COST-LABEL: smax.i16.v4i16
-; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
 ; CODE-LABEL: smax.i16.v4i16
 ; CODE:       smaxv h0, v0.4h
 define i16 @smax.i16.v4i16(<4 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: smax.i16.v8i16
-; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)
 ; CODE-LABEL: smax.i16.v8i16
 ; CODE:       smaxv h0, v0.8h
 define i16 @smax.i16.v8i16(<8 x i16> %v) {
-  %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %v)
+  %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)
   ret i16 %r
 }
 
 ; COST-LABEL: smax.i32.v4i32
-; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v)
 ; CODE-LABEL: smax.i32.v4i32
 ; CODE:       smaxv s0, v0.4s
 define i32 @smax.i32.v4i32(<4 x i32> %v) {
-  %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %v)
+  %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v)
   ret i32 %r
 }
 
 ; COST-LABEL: fmin.f32.v4f32
-; COST:       Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v)
 ; CODE-LABEL: fmin.f32.v4f32
 ; CODE:       fminnmv s0, v0.4s
 define float @fmin.f32.v4f32(<4 x float> %v) {
-  %r = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %v)
+  %r = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v)
   ret float %r
 }
 
 ; COST-LABEL: fmax.f32.v4f32
-; COST:       Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %v)
+; COST:       Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v)
 ; CODE-LABEL: fmax.f32.v4f32
 ; CODE:       fmaxnmv s0, v0.4s
 define float @fmax.f32.v4f32(<4 x float> %v) {
-  %r = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %v)
+  %r = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v)
   ret float %r
 }
 
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-
-declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-
-declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-
-declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-
-declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)

diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
index a03f283cd1e4..d71af6d0b6cf 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
@@ -8,155 +8,155 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; V8M-RECIP-LABEL: 'reduce_i64'
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-RECIP-LABEL: 'reduce_i64'
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-RECIP-LABEL: 'reduce_i64'
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'reduce_i64'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'reduce_i64'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'reduce_i64'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; V8M-RECIP-LABEL: 'reduce_i32'
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-RECIP-LABEL: 'reduce_i32'
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-RECIP-LABEL: 'reduce_i32'
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4120 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 5658 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 11806 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 36390 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4120 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 5658 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 11806 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 36390 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'reduce_i32'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'reduce_i32'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'reduce_i32'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)

diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
index 8b8ef521764f..87123b45502f 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
@@ -5,171 +5,171 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; V8M-LABEL: 'reduce_i64'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i64'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i64'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; V8M-LABEL: 'reduce_i32'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i32'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i32'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; V8M-LABEL: 'reduce_i16'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i16'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i16'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; V8M-LABEL: 'reduce_i8'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i8'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i8'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)

diff  --git a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
index a39f2ffaf648..88790472ba38 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
@@ -5,171 +5,171 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; V8M-LABEL: 'reduce_i64'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i64'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i64'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; V8M-LABEL: 'reduce_i32'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i32'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i32'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; V8M-LABEL: 'reduce_i16'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i16'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i16'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; V8M-LABEL: 'reduce_i8'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i8'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i8'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)

diff  --git a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll
index bb3205ab3336..fbdc56e16297 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll
@@ -5,171 +5,171 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; V8M-LABEL: 'reduce_i64'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i64'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i64'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; V8M-LABEL: 'reduce_i32'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i32'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i32'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; V8M-LABEL: 'reduce_i16'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i16'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i16'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; V8M-LABEL: 'reduce_i8'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i8'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i8'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)

diff  --git a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll
index a5e0e56e0465..bb94dbe2a5b8 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll
@@ -5,171 +5,171 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; V8M-LABEL: 'reduce_i64'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i64'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i64'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; V8M-LABEL: 'reduce_i32'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i32'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i32'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; V8M-LABEL: 'reduce_i16'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i16'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i16'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; V8M-LABEL: 'reduce_i8'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; V8M-NEXT:  Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; V8M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-LABEL: 'reduce_i8'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i8'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; MVE-NEXT:  Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)

diff  --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll
index d7ca1536072b..cb5d0fa7ea03 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-add.ll
@@ -12,279 +12,279 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i64'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i32'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i16'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i8'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)

diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll b/llvm/test/Analysis/CostModel/X86/reduce-and.ll
index 8bcf5da07bda..37a2146d52b4 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-and.ll
@@ -10,258 +10,258 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i1(i32 %arg) {
 ; SSE-LABEL: 'reduce_i1'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i1'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i1'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1   = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef)
-  %V2   = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef)
-  %V4   = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef)
-  %V8   = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef)
-  %V16  = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef)
-  %V32  = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef)
-  %V64  = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef)
-  %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef)
+  %V1   = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
 
-declare i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1>)
+declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1>)
+declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>)
+declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>)
+declare i1 @llvm.vector.reduce.and.v128i1(<128 x i1>)

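For reference, a minimal standalone sketch (not part of this patch; the function name is illustrative) of how the renamed integer reduction is called — it folds the vector lanes together with a bitwise AND and returns the scalar result:

define i32 @and_reduce_example(<4 x i32> %v) {
  ; reduce the four i32 lanes of %v with AND into a single i32
  %r = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v)
  ret i32 %r
}
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)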
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll
index 41db9ed818e5..4f803c4e107b 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll
@@ -11,83 +11,83 @@
 
 define i32 @reduce_f64(i32 %arg) {
 ; SSE-LABEL: 'reduce_f64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_f64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef)
-  %V2  = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef)
-  %V4  = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef)
-  %V8  = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef)
-  %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef)
+  %V1  = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+  %V2  = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+  %V4  = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+  %V8  = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+  %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
   ret i32 undef
 }
 
 define i32 @reduce_f32(i32 %arg) {
 ; SSE-LABEL: 'reduce_f32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_f32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef)
-  %V2  = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef)
-  %V4  = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef)
-  %V8  = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef)
-  %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef)
-  %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef)
+  %V1  = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+  %V2  = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+  %V4  = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+  %V8  = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+  %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+  %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
   ret i32 undef
 }
 
-declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>)
+declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
 
-declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float>)
+declare float @llvm.vector.reduce.fmax.v1f32(<1 x float>)
+declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
+declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)

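Likewise, a minimal standalone sketch (illustrative only, not part of this patch) of the renamed floating-point reduction — it reduces the vector to the largest lane value:

define float @fmax_reduce_example(<4 x float> %v) {
  ; reduce the four float lanes of %v to their maximum
  %r = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v)
  ret float %r
}
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)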
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll
index a12cd8e60703..70b0a44313d2 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll
@@ -11,83 +11,83 @@
 
 define i32 @reduce_f64(i32 %arg) {
 ; SSE-LABEL: 'reduce_f64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_f64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef)
-  %V2  = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef)
-  %V4  = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef)
-  %V8  = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef)
-  %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef)
+  %V1  = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+  %V2  = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+  %V4  = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+  %V8  = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+  %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
   ret i32 undef
 }
 
 define i32 @reduce_f32(i32 %arg) {
 ; SSE-LABEL: 'reduce_f32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_f32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef)
-  %V2  = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef)
-  %V4  = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef)
-  %V8  = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef)
-  %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef)
-  %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef)
+  %V1  = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+  %V2  = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+  %V4  = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+  %V8  = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+  %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+  %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
   ret i32 undef
 }
 
-declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>)
+declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
 
-declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float>)
+declare float @llvm.vector.reduce.fmin.v1f32(<1 x float>)
+declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
+declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>)

diff  --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
index a4984f5bdcbd..b79d262b80a6 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -10,276 +10,276 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i64'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i64'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i64'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)

diff  --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll
index 97e0847a50a1..065a75c44abf 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-or.ll
@@ -10,258 +10,258 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i1(i32 %arg) {
 ; SSE-LABEL: 'reduce_i1'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i1'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i1'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1   = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef)
-  %V2   = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef)
-  %V4   = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef)
-  %V8   = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef)
-  %V16  = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef)
-  %V32  = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef)
-  %V64  = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef)
-  %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef)
+  %V1   = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
 
-declare i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1>)
+declare i1 @llvm.vector.reduce.or.v1i1(<1 x i1>)
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>)
+declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>)
+declare i1 @llvm.vector.reduce.or.v128i1(<128 x i1>)

diff  --git a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll
index 19070d8721fa..cbbad17b137a 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll
@@ -11,322 +11,322 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'reduce_i64'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i32'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i16'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i8'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)

diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll
index 07483dec5b8a..0901d09e0360 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll
@@ -11,322 +11,322 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'reduce_i64'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i32'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i16'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i8'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)

diff  --git a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll
index b8787b7b7e79..d2a81fe25378 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll
@@ -11,322 +11,322 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'reduce_i64'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i32'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i16'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i8'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)

diff  --git a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll
index 5a8b49b6c633..4dbdf71eadc0 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll
@@ -11,322 +11,322 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'reduce_i64'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i32'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i16'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i8'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)

diff  --git a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll
index f1e586bd2e9a..15d3e968d4df 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll
@@ -10,280 +10,280 @@
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1  = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef)
-  %V2  = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef)
-  %V4  = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef)
-  %V8  = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef)
-  %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef)
+  %V1  = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef)
+  %V2  = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef)
+  %V2  = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+  %V4  = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2   = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
-  %V4   = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
-  %V8   = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
-  %V16  = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef)
-  %V32  = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef)
-  %V64  = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef)
-  %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
   ret i32 undef
 }
 
 define i32 @reduce_i1(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i1'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i1'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i1'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i1'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i1'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V1   = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef)
-  %V2   = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef)
-  %V4   = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef)
-  %V8   = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef)
-  %V16  = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef)
-  %V32  = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef)
-  %V64  = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef)
-  %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef)
+  %V1   = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
   ret i32 undef
 }
 
-declare i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
 
-declare i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1>)
+declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1>)
+declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1>)
+declare i1 @llvm.vector.reduce.xor.v64i1(<64 x i1>)
+declare i1 @llvm.vector.reduce.xor.v128i1(<128 x i1>)

diff --git a/llvm/test/Assembler/invalid-vecreduce.ll b/llvm/test/Assembler/invalid-vecreduce.ll
index 301b0a8938ef..4806ea60883f 100644
--- a/llvm/test/Assembler/invalid-vecreduce.ll
+++ b/llvm/test/Assembler/invalid-vecreduce.ll
@@ -1,34 +1,34 @@
 ; RUN: not opt -S < %s 2>&1 | FileCheck %s
 
-; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64
+; CHECK: Intrinsic has incorrect return type!
+; CHECK-NEXT: float (double, <2 x double>)* @llvm.vector.reduce.fadd.f32.f64.v2f64
 define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) {
-  %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+  %res = call float @llvm.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
   ret float %res
 }
 
 ; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64
+; CHECK-NEXT: double (float, <2 x double>)* @llvm.vector.reduce.fadd.f64.f32.v2f64
 define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) {
-  %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
+  %res = call double @llvm.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
   ret double %res
 }
 
-; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64
+; CHECK: Intrinsic has incorrect return type!
+; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.vector.reduce.fadd.v2f64.f64.v2f64
 define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) {
-  %res = call <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
+  %res = call <2 x double> @llvm.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
   ret <2 x double> %res
 }
 
 ; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64
+; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.vector.reduce.fadd.f64.v2f64.v2f64
 define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) {
-  %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
+  %res = call double @llvm.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
   ret double %res
 }
 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
-declare <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
+declare float @llvm.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+declare double @llvm.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
+declare double @llvm.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
+declare <2 x double> @llvm.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)

diff --git a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll
index c91c7bc3953a..124d58956514 100644
--- a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll
+++ b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll
@@ -1,64 +1,130 @@
 ; RUN: opt -S < %s | FileCheck %s
 ; RUN: llvm-dis < %s.bc | FileCheck %s
 
-define float @fadd_acc(<4 x float> %in, float %acc) {
-; CHECK-LABEL: @fadd_acc
-; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in)
-  %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in)
+
+define float @fadd_v2(<4 x float> %in, float %acc) {
+; CHECK-LABEL: @fadd_v2
+; CHECK: %res = call float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %in)
+  %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in)
   ret float %res
 }
 
-define float @fadd_undef(<4 x float> %in) {
-; CHECK-LABEL: @fadd_undef
-; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %in)
-  %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in)
+define float @fadd_v2_fast(<4 x float> %in, float %acc) {
+; CHECK-LABEL: @fadd_v2_fast
+; CHECK: %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %in)
+  %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in)
   ret float %res
 }
 
-define float @fadd_fast_acc(<4 x float> %in, float %acc) {
-; CHECK-LABEL: @fadd_fast_acc
-; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in)
-  %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in)
+define float @fmul_v2(<4 x float> %in, float %acc) {
+; CHECK-LABEL: @fmul_v2
+; CHECK: %res = call float @llvm.vector.reduce.fmul.v4f32(float %acc, <4 x float> %in)
+  %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in)
   ret float %res
 }
 
-define float @fadd_fast_undef(<4 x float> %in) {
-; CHECK-LABEL: @fadd_fast_undef
-; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in)
-  %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in)
+define float @fmul_v2_fast(<4 x float> %in, float %acc) {
+; CHECK-LABEL: @fmul_v2_fast
+; CHECK: %res = call fast  float @llvm.vector.reduce.fmul.v4f32(float %acc, <4 x float> %in)
+  %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in)
   ret float %res
 }
 
-define float @fmul_acc(<4 x float> %in, float %acc) {
-; CHECK-LABEL: @fmul_acc
-; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in)
-  %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in)
+define float @fmin(<4 x float> %in) {
+; CHECK-LABEL: @fmin
+; CHECK: %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %in)
+  %res = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %in)
   ret float %res
 }
 
-define float @fmul_undef(<4 x float> %in) {
-; CHECK-LABEL: @fmul_undef
-; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %in)
-  %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in)
+define float @fmax(<4 x float> %in) {
+; CHECK-LABEL: @fmax
+; CHECK: %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %in)
+  %res = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %in)
   ret float %res
 }
 
-define float @fmul_fast_acc(<4 x float> %in, float %acc) {
-; CHECK-LABEL: @fmul_fast_acc
-; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in)
-  %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in)
-  ret float %res
+define i32 @and(<4 x i32> %in) {
+; CHECK-LABEL: @and
+; CHECK: %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %in)
+  %res = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %in)
+  ret i32 %res
 }
 
-define float @fmul_fast_undef(<4 x float> %in) {
-; CHECK-LABEL: @fmul_fast_undef
-; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in)
-  %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in)
-  ret float %res
+define i32 @or(<4 x i32> %in) {
+; CHECK-LABEL: @or
+; CHECK: %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %in)
+  %res = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %in)
+  ret i32 %res
+}
+
+define i32 @xor(<4 x i32> %in) {
+; CHECK-LABEL: @xor
+; CHECK: %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %in)
+  %res = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %in)
+  ret i32 %res
+}
+
+define i32 @smin(<4 x i32> %in) {
+; CHECK-LABEL: @smin
+; CHECK: %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %in)
+  %res = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %in)
+  ret i32 %res
 }
 
-declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
-; CHECK: declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
+define i32 @smax(<4 x i32> %in) {
+; CHECK-LABEL: @smax
+; CHECK: %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %in)
+  %res = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %in)
+  ret i32 %res
+}
+
+define i32 @umin(<4 x i32> %in) {
+; CHECK-LABEL: @umin
+; CHECK: %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %in)
+  %res = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %in)
+  ret i32 %res
+}
+
+define i32 @umax(<4 x i32> %in) {
+; CHECK-LABEL: @umax
+; CHECK: %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %in)
+  %res = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %in)
+  ret i32 %res
+}
+
+
+declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
+
+declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
+; CHECK: declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+
+declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
+; CHECK: declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+
+declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
+; CHECK: declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+
+declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
+; CHECK: declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+
+declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>)
+; CHECK: declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+
+declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
+; CHECK: declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+
+declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
+; CHECK: declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+
+declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
+; CHECK: declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+
+declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
+; CHECK: declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+
+
+
+
 
-declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
-; CHECK: declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)

diff --git a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc
index 18f44f7acd98..2c9af9a521b7 100644
Binary files a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc and b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc differ

diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index f1e0f299bd49..c037a35ba379 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -1,16 +1,16 @@
 ; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s
 
 ; Function Attrs: nounwind readnone
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
 
 define i8 @add_B(<16 x i8>* %arr)  {
 ; CHECK-LABEL: add_B
 ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
   %bin.rdx = load <16 x i8>, <16 x i8>* %arr
-  %r = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %bin.rdx)
+  %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %bin.rdx)
   ret i8 %r
 }
 
@@ -18,7 +18,7 @@ define i16 @add_H(<8 x i16>* %arr)  {
 ; CHECK-LABEL: add_H
 ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
   %bin.rdx = load <8 x i16>, <8 x i16>* %arr
-  %r = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %bin.rdx)
+  %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx)
   ret i16 %r
 }
 
@@ -26,7 +26,7 @@ define i32 @add_S( <4 x i32>* %arr)  {
 ; CHECK-LABEL: add_S
 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
   %bin.rdx = load <4 x i32>, <4 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
+  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
   ret i32 %r
 }
 
@@ -35,11 +35,11 @@ define i64 @add_D(<2 x i64>* %arr)  {
 ; CHECK-NOT: addv
 ; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
   %bin.rdx = load <2 x i64>, <2 x i64>* %arr
-  %r = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %bin.rdx)
+  %r = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %bin.rdx)
   ret i64 %r
 }
 
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
 
 define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) {
 ; CHECK-LABEL: oversized_ADDV_256
@@ -55,16 +55,16 @@ entry:
   %7 = icmp slt <8 x i32> %6, zeroinitializer
   %8 = sub nsw <8 x i32> zeroinitializer, %6
   %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
-  %r = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %9)
+  %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %9)
   ret i32 %r
 }
 
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
 
 define i32 @oversized_ADDV_512(<16 x i32>* %arr)  {
 ; CHECK-LABEL: oversized_ADDV_512
 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
   %bin.rdx = load <16 x i32>, <16 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %bin.rdx)
+  %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx)
   ret i32 %r
 }

diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
index b6f95a7f3bf0..4ff80296c0d9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
@@ -2,28 +2,28 @@
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
 
 ; CHECK-LABEL: smax_B
 ; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
 define i8 @smax_B(<16 x i8>* nocapture readonly %arr)  {
   %arr.load = load <16 x i8>, <16 x i8>* %arr
-  %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %arr.load)
+  %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %arr.load)
   ret i8 %r
 }
 
@@ -31,7 +31,7 @@ define i8 @smax_B(<16 x i8>* nocapture readonly %arr)  {
 ; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
 define i16 @smax_H(<8 x i16>* nocapture readonly %arr) {
   %arr.load = load <8 x i16>, <8 x i16>* %arr
-  %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %arr.load)
+  %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %arr.load)
   ret i16 %r
 }
 
@@ -39,7 +39,7 @@ define i16 @smax_H(<8 x i16>* nocapture readonly %arr) {
 ; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 define i32 @smax_S(<4 x i32> * nocapture readonly %arr)  {
   %arr.load = load <4 x i32>, <4 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %arr.load)
+  %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %arr.load)
   ret i32 %r
 }
 
@@ -47,7 +47,7 @@ define i32 @smax_S(<4 x i32> * nocapture readonly %arr)  {
 ; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
 define i8 @umax_B(<16 x i8>* nocapture readonly %arr)  {
   %arr.load = load <16 x i8>, <16 x i8>* %arr
-  %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %arr.load)
+  %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %arr.load)
   ret i8 %r
 }
 
@@ -55,7 +55,7 @@ define i8 @umax_B(<16 x i8>* nocapture readonly %arr)  {
 ; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
 define i16 @umax_H(<8 x i16>* nocapture readonly %arr)  {
   %arr.load = load <8 x i16>, <8 x i16>* %arr
-  %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %arr.load)
+  %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %arr.load)
   ret i16 %r
 }
 
@@ -63,7 +63,7 @@ define i16 @umax_H(<8 x i16>* nocapture readonly %arr)  {
 ; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 define i32 @umax_S(<4 x i32>* nocapture readonly %arr) {
   %arr.load = load <4 x i32>, <4 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %arr.load)
+  %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %arr.load)
   ret i32 %r
 }
 
@@ -71,7 +71,7 @@ define i32 @umax_S(<4 x i32>* nocapture readonly %arr) {
 ; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b
 define i8 @smin_B(<16 x i8>* nocapture readonly %arr) {
   %arr.load = load <16 x i8>, <16 x i8>* %arr
-  %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %arr.load)
+  %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %arr.load)
   ret i8 %r
 }
 
@@ -79,7 +79,7 @@ define i8 @smin_B(<16 x i8>* nocapture readonly %arr) {
 ; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h
 define i16 @smin_H(<8 x i16>* nocapture readonly %arr) {
   %arr.load = load <8 x i16>, <8 x i16>* %arr
-  %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %arr.load)
+  %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %arr.load)
   ret i16 %r
 }
 
@@ -87,7 +87,7 @@ define i16 @smin_H(<8 x i16>* nocapture readonly %arr) {
 ; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 define i32 @smin_S(<4 x i32>* nocapture readonly %arr) {
   %arr.load = load <4 x i32>, <4 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %arr.load)
+  %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %arr.load)
   ret i32 %r
 }
 
@@ -95,7 +95,7 @@ define i32 @smin_S(<4 x i32>* nocapture readonly %arr) {
 ; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b
 define i8 @umin_B(<16 x i8>* nocapture readonly %arr)  {
   %arr.load = load <16 x i8>, <16 x i8>* %arr
-  %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %arr.load)
+  %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %arr.load)
   ret i8 %r
 }
 
@@ -103,7 +103,7 @@ define i8 @umin_B(<16 x i8>* nocapture readonly %arr)  {
 ; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h
 define i16 @umin_H(<8 x i16>* nocapture readonly %arr)  {
   %arr.load = load <8 x i16>, <8 x i16>* %arr
-  %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %arr.load)
+  %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %arr.load)
   ret i16 %r
 }
 
@@ -111,7 +111,7 @@ define i16 @umin_H(<8 x i16>* nocapture readonly %arr)  {
 ; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 define i32 @umin_S(<4 x i32>* nocapture readonly %arr) {
   %arr.load = load <4 x i32>, <4 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %arr.load)
+  %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %arr.load)
   ret i32 %r
 }
 
@@ -119,7 +119,7 @@ define i32 @umin_S(<4 x i32>* nocapture readonly %arr) {
 ; CHECK: fmaxnmv
 define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) {
   %arr.load  = load <4 x float>, <4 x float>* %arr
-  %r = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %arr.load)
+  %r = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arr.load)
   ret float %r
 }
 
@@ -127,22 +127,22 @@ define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) {
 ; CHECK: fminnmv
 define float @fminnm_S(<4 x float>* nocapture readonly %arr) {
   %arr.load  = load <4 x float>, <4 x float>* %arr
-  %r = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %arr.load)
+  %r = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arr.load)
   ret float %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
 
 define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr)  {
 ; CHECK-LABEL: oversized_umax_256
 ; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 ; CHECK: umaxv {{h[0-9]+}}, [[V0]]
   %arr.load = load <16 x i16>, <16 x i16>* %arr
-  %r = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %arr.load)
+  %r = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %arr.load)
   ret i16 %r
 }
 
-declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
 
 define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr)  {
 ; CHECK-LABEL: oversized_umax_512
@@ -151,22 +151,22 @@ define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr)  {
 ; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]]
   %arr.load = load <16 x i32>, <16 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %arr.load)
+  %r = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %arr.load)
   ret i32 %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
 
 define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr)  {
 ; CHECK-LABEL: oversized_umin_256
 ; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 ; CHECK: uminv {{h[0-9]+}}, [[V0]]
   %arr.load = load <16 x i16>, <16 x i16>* %arr
-  %r = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %arr.load)
+  %r = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %arr.load)
   ret i16 %r
 }
 
-declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
 
 define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr)  {
 ; CHECK-LABEL: oversized_umin_512
@@ -175,22 +175,22 @@ define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr)  {
 ; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]]
   %arr.load = load <16 x i32>, <16 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %arr.load)
+  %r = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %arr.load)
   ret i32 %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
 
 define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr)  {
 ; CHECK-LABEL: oversized_smax_256
 ; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 ; CHECK: smaxv {{h[0-9]+}}, [[V0]]
   %arr.load = load <16 x i16>, <16 x i16>* %arr
-  %r = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %arr.load)
+  %r = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %arr.load)
   ret i16 %r
 }
 
-declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
 
 define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr)  {
 ; CHECK-LABEL: oversized_smax_512
@@ -199,22 +199,22 @@ define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr)  {
 ; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]]
   %arr.load = load <16 x i32>, <16 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %arr.load)
+  %r = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %arr.load)
   ret i32 %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
 
 define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr)  {
 ; CHECK-LABEL: oversized_smin_256
 ; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 ; CHECK: sminv {{h[0-9]+}}, [[V0]]
   %arr.load = load <16 x i16>, <16 x i16>* %arr
-  %r = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %arr.load)
+  %r = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %arr.load)
   ret i16 %r
 }
 
-declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
 
 define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr)  {
 ; CHECK-LABEL: oversized_smin_512
@@ -223,6 +223,6 @@ define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr)  {
 ; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]]
   %arr.load = load <16 x i32>, <16 x i32>* %arr
-  %r = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %arr.load)
+  %r = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %arr.load)
   ret i32 %r
 }

diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index 419cafc23186..23eddd4e174e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -141,7 +141,7 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
   ret <2 x i64> %tmp4
 }
 
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
 
 define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
 ; CHECK-LABEL: uabdl8h_rdx
@@ -155,11 +155,11 @@ define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
   %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
   %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
   %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
-  %reduced_v = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %absel)
+  %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)
   ret i16 %reduced_v
 }
 
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
 
 define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
 ; CHECK-LABEL: uabdl4s_rdx
@@ -173,11 +173,11 @@ define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
   %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
   %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
   %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
-  %reduced_v = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %absel)
+  %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
   ret i32 %reduced_v
 }
 
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
 
 define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
 ; CHECK: uabdl2d_rdx
@@ -191,7 +191,7 @@ define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
   %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
   %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
   %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
-  %reduced_v = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %absel)
+  %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
   ret i64 %reduced_v
 }
 

diff --git a/llvm/test/CodeGen/AArch64/neon-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-dot-product.ll
index eef89ab6ff39..67435546f971 100644
--- a/llvm/test/CodeGen/AArch64/neon-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dot-product.ll
@@ -205,7 +205,7 @@ entry:
   ret void
 }
 
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
 
 define i32 @test_udot_v8i8(i8* nocapture readonly %a, i8* nocapture readonly %b) {
 entry:
@@ -218,7 +218,7 @@ entry:
   %4 = load <8 x i8>, <8 x i8>* %3
   %5 = zext <8 x i8> %4 to <8 x i32>
   %6 = mul nuw nsw <8 x i32> %5, %2
-  %7 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %6)
+  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
   ret i32 %7
 }
 
@@ -233,11 +233,11 @@ entry:
   %4 = load <8 x i8>, <8 x i8>* %3
   %5 = sext <8 x i8> %4 to <8 x i32>
   %6 = mul nsw <8 x i32> %5, %2
-  %7 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %6)
+  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
   ret i32 %7
 }
 
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
 
 define i32 @test_udot_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %sum) {
 entry:
@@ -250,7 +250,7 @@ entry:
   %4 = load <16 x i8>, <16 x i8>* %3
   %5 = zext <16 x i8> %4 to <16 x i32>
   %6 = mul nuw nsw <16 x i32> %5, %2
-  %7 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %6)
+  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
   %op.extra = add i32 %7, %sum
   ret i32 %op.extra
 }
@@ -265,7 +265,7 @@ entry:
   %0 = bitcast i8* %a1 to <16 x i8>*
   %1 = load <16 x i8>, <16 x i8>* %0
   %2 = zext <16 x i8> %1 to <16 x i32>
-  %3 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %2)
+  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
   ret i32 %3
 }
 
@@ -280,7 +280,7 @@ entry:
   %4 = load <16 x i8>, <16 x i8>* %3
   %5 = sext <16 x i8> %4 to <16 x i32>
   %6 = mul nsw <16 x i32> %5, %2
-  %7 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %6)
+  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
   %op.extra = add nsw i32 %7, %sum
   ret i32 %op.extra
 }
@@ -295,6 +295,6 @@ entry:
   %0 = bitcast i8* %a1 to <16 x i8>*
   %1 = load <16 x i8>, <16 x i8>* %0
   %2 = sext <16 x i8> %1 to <16 x i32>
-  %3 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %2)
+  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
   ret i32 %3
 }

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
index 68501a797178..10aca253e4e6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
@@ -29,7 +29,7 @@ define half @fmaxv_v4f16(<4 x half> %a) #0 {
 ; CHECK-LABEL: fmaxv_v4f16:
 ; CHECK: fmaxnmv h0, v0.4h
 ; CHECK-NEXT: ret
-  %res = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %a)
+  %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %res
 }
 
@@ -38,7 +38,7 @@ define half @fmaxv_v8f16(<8 x half> %a) #0 {
 ; CHECK-LABEL: fmaxv_v8f16:
 ; CHECK: fmaxnmv h0, v0.8h
 ; CHECK-NEXT: ret
-  %res = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %a)
+  %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
   ret half %res
 }
 
@@ -49,7 +49,7 @@ define half @fmaxv_v16f16(<16 x half>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
 ; VBITS_GE_256-NEXT: ret
   %op = load <16 x half>, <16 x half>* %a
-  %res = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %op)
+  %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
   ret half %res
 }
 
@@ -60,7 +60,7 @@ define half @fmaxv_v32f16(<32 x half>* %a) #0 {
 ; VBITS_GE_512-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
 ; VBITS_GE_512-NEXT: ret
   %op = load <32 x half>, <32 x half>* %a
-  %res = call half @llvm.experimental.vector.reduce.fmax.v32f16(<32 x half> %op)
+  %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
   ret half %res
 }
 
@@ -71,7 +71,7 @@ define half @fmaxv_v64f16(<64 x half>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
 ; VBITS_GE_1024-NEXT: ret
   %op = load <64 x half>, <64 x half>* %a
-  %res = call half @llvm.experimental.vector.reduce.fmax.v64f16(<64 x half> %op)
+  %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
   ret half %res
 }
 
@@ -82,7 +82,7 @@ define half @fmaxv_v128f16(<128 x half>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
 ; VBITS_GE_2048-NEXT: ret
   %op = load <128 x half>, <128 x half>* %a
-  %res = call half @llvm.experimental.vector.reduce.fmax.v128f16(<128 x half> %op)
+  %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
   ret half %res
 }
 
@@ -91,7 +91,7 @@ define float @fmaxv_v2f32(<2 x float> %a) #0 {
 ; CHECK-LABEL: fmaxv_v2f32:
 ; CHECK: fmaxnmp s0, v0.2s
 ; CHECK: ret
-  %res = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a)
+  %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
   ret float %res
 }
 
@@ -100,7 +100,7 @@ define float @fmaxv_v4f32(<4 x float> %a) #0 {
 ; CHECK-LABEL: fmaxv_v4f32:
 ; CHECK: fmaxnmv s0, v0.4s
 ; CHECK: ret
-  %res = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a)
+  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
   ret float %res
 }
 
@@ -111,7 +111,7 @@ define float @fmaxv_v8f32(<8 x float>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
 ; VBITS_GE_256-NEXT: ret
   %op = load <8 x float>, <8 x float>* %a
-  %res = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %op)
+  %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
   ret float %res
 }
 
@@ -122,7 +122,7 @@ define float @fmaxv_v16f32(<16 x float>* %a) #0 {
 ; VBITS_GE_512-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
 ; VBITS_GE_512-NEXT: ret
   %op = load <16 x float>, <16 x float>* %a
-  %res = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %op)
+  %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
   ret float %res
 }
 
@@ -133,7 +133,7 @@ define float @fmaxv_v32f32(<32 x float>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
 ; VBITS_GE_1024-NEXT: ret
   %op = load <32 x float>, <32 x float>* %a
-  %res = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> %op)
+  %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
   ret float %res
 }
 
@@ -144,7 +144,7 @@ define float @fmaxv_v64f32(<64 x float>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
 ; VBITS_GE_2048-NEXT: ret
   %op = load <64 x float>, <64 x float>* %a
-  %res = call float @llvm.experimental.vector.reduce.fmax.v64f32(<64 x float> %op)
+  %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
   ret float %res
 }
 
@@ -153,7 +153,7 @@ define double @fmaxv_v1f64(<1 x double> %a) #0 {
 ; CHECK-LABEL: fmaxv_v1f64:
 ; CHECK-NOT: fmax
 ; CHECK: ret
-  %res = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a)
+  %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
   ret double %res
 }
 
@@ -162,7 +162,7 @@ define double @fmaxv_v2f64(<2 x double> %a) #0 {
 ; CHECK-LABEL: fmaxv_v2f64:
 ; CHECK: fmaxnmp d0, v0.2d
 ; CHECK-NEXT: ret
-  %res = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a)
+  %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
   ret double %res
 }
 
@@ -173,7 +173,7 @@ define double @fmaxv_v4f64(<4 x double>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
 ; VBITS_GE_256-NEXT: ret
   %op = load <4 x double>, <4 x double>* %a
-  %res = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %op)
+  %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
   ret double %res
 }
 
@@ -184,7 +184,7 @@ define double @fmaxv_v8f64(<8 x double>* %a) #0 {
 ; VBITS_GE_512-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
 ; VBITS_GE_512-NEXT: ret
   %op = load <8 x double>, <8 x double>* %a
-  %res = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %op)
+  %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
   ret double %res
 }
 
@@ -195,7 +195,7 @@ define double @fmaxv_v16f64(<16 x double>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
 ; VBITS_GE_1024-NEXT: ret
   %op = load <16 x double>, <16 x double>* %a
-  %res = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %op)
+  %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
   ret double %res
 }
 
@@ -206,7 +206,7 @@ define double @fmaxv_v32f64(<32 x double>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
 ; VBITS_GE_2048-NEXT: ret
   %op = load <32 x double>, <32 x double>* %a
-  %res = call double @llvm.experimental.vector.reduce.fmax.v32f64(<32 x double> %op)
+  %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
   ret double %res
 }
 
@@ -219,7 +219,7 @@ define half @fminv_v4f16(<4 x half> %a) #0 {
 ; CHECK-LABEL: fminv_v4f16:
 ; CHECK: fminnmv h0, v0.4h
 ; CHECK-NEXT: ret
-  %res = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %a)
+  %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
   ret half %res
 }
 
@@ -228,7 +228,7 @@ define half @fminv_v8f16(<8 x half> %a) #0 {
 ; CHECK-LABEL: fminv_v8f16:
 ; CHECK: fminnmv h0, v0.8h
 ; CHECK-NEXT: ret
-  %res = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %a)
+  %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
   ret half %res
 }
 
@@ -239,7 +239,7 @@ define half @fminv_v16f16(<16 x half>* %a) #0 {
 ; VBITS_GE_256-NEXT: fminnmv h0, [[PG]], [[OP]].h
 ; VBITS_GE_256-NEXT: ret
   %op = load <16 x half>, <16 x half>* %a
-  %res = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %op)
+  %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
   ret half %res
 }
 
@@ -250,7 +250,7 @@ define half @fminv_v32f16(<32 x half>* %a) #0 {
 ; VBITS_GE_512-NEXT: fminnmv h0, [[PG]], [[OP]].h
 ; VBITS_GE_512-NEXT: ret
   %op = load <32 x half>, <32 x half>* %a
-  %res = call half @llvm.experimental.vector.reduce.fmin.v32f16(<32 x half> %op)
+  %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
   ret half %res
 }
 
@@ -261,7 +261,7 @@ define half @fminv_v64f16(<64 x half>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fminnmv h0, [[PG]], [[OP]].h
 ; VBITS_GE_1024-NEXT: ret
   %op = load <64 x half>, <64 x half>* %a
-  %res = call half @llvm.experimental.vector.reduce.fmin.v64f16(<64 x half> %op)
+  %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
   ret half %res
 }
 
@@ -272,7 +272,7 @@ define half @fminv_v128f16(<128 x half>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fminnmv h0, [[PG]], [[OP]].h
 ; VBITS_GE_2048-NEXT: ret
   %op = load <128 x half>, <128 x half>* %a
-  %res = call half @llvm.experimental.vector.reduce.fmin.v128f16(<128 x half> %op)
+  %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
   ret half %res
 }
 
@@ -281,7 +281,7 @@ define float @fminv_v2f32(<2 x float> %a) #0 {
 ; CHECK-LABEL: fminv_v2f32:
 ; CHECK: fminnmp s0, v0.2s
 ; CHECK: ret
-  %res = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a)
+  %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
   ret float %res
 }
 
@@ -290,7 +290,7 @@ define float @fminv_v4f32(<4 x float> %a) #0 {
 ; CHECK-LABEL: fminv_v4f32:
 ; CHECK: fminnmv s0, v0.4s
 ; CHECK: ret
-  %res = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a)
+  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
   ret float %res
 }
 
@@ -301,7 +301,7 @@ define float @fminv_v8f32(<8 x float>* %a) #0 {
 ; VBITS_GE_256-NEXT: fminnmv s0, [[PG]], [[OP]].s
 ; VBITS_GE_256-NEXT: ret
   %op = load <8 x float>, <8 x float>* %a
-  %res = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %op)
+  %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
   ret float %res
 }
 
@@ -312,7 +312,7 @@ define float @fminv_v16f32(<16 x float>* %a) #0 {
 ; VBITS_GE_512-NEXT: fminnmv s0, [[PG]], [[OP]].s
 ; VBITS_GE_512-NEXT: ret
   %op = load <16 x float>, <16 x float>* %a
-  %res = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %op)
+  %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
   ret float %res
 }
 
@@ -323,7 +323,7 @@ define float @fminv_v32f32(<32 x float>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fminnmv s0, [[PG]], [[OP]].s
 ; VBITS_GE_1024-NEXT: ret
   %op = load <32 x float>, <32 x float>* %a
-  %res = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> %op)
+  %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
   ret float %res
 }
 
@@ -334,7 +334,7 @@ define float @fminv_v64f32(<64 x float>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fminnmv s0, [[PG]], [[OP]].s
 ; VBITS_GE_2048-NEXT: ret
   %op = load <64 x float>, <64 x float>* %a
-  %res = call float @llvm.experimental.vector.reduce.fmin.v64f32(<64 x float> %op)
+  %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
   ret float %res
 }
 
@@ -343,7 +343,7 @@ define double @fminv_v1f64(<1 x double> %a) #0 {
 ; CHECK-LABEL: fminv_v1f64:
 ; CHECK-NOT: fmin
 ; CHECK: ret
-  %res = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a)
+  %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
   ret double %res
 }
 
@@ -352,7 +352,7 @@ define double @fminv_v2f64(<2 x double> %a) #0 {
 ; CHECK-LABEL: fminv_v2f64:
 ; CHECK: fminnmp d0, v0.2d
 ; CHECK-NEXT: ret
-  %res = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a)
+  %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
   ret double %res
 }
 
@@ -363,7 +363,7 @@ define double @fminv_v4f64(<4 x double>* %a) #0 {
 ; VBITS_GE_256-NEXT: fminnmv d0, [[PG]], [[OP]].d
 ; VBITS_GE_256-NEXT: ret
   %op = load <4 x double>, <4 x double>* %a
-  %res = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %op)
+  %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
   ret double %res
 }
 
@@ -374,7 +374,7 @@ define double @fminv_v8f64(<8 x double>* %a) #0 {
 ; VBITS_GE_512-NEXT: fminnmv d0, [[PG]], [[OP]].d
 ; VBITS_GE_512-NEXT: ret
   %op = load <8 x double>, <8 x double>* %a
-  %res = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %op)
+  %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
   ret double %res
 }
 
@@ -385,7 +385,7 @@ define double @fminv_v16f64(<16 x double>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fminnmv d0, [[PG]], [[OP]].d
 ; VBITS_GE_1024-NEXT: ret
   %op = load <16 x double>, <16 x double>* %a
-  %res = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %op)
+  %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
   ret double %res
 }
 
@@ -396,50 +396,50 @@ define double @fminv_v32f64(<32 x double>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fminnmv d0, [[PG]], [[OP]].d
 ; VBITS_GE_2048-NEXT: ret
   %op = load <32 x double>, <32 x double>* %a
-  %res = call double @llvm.experimental.vector.reduce.fmin.v32f64(<32 x double> %op)
+  %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
   ret double %res
 }
 
 attributes #0 = { "target-features"="+sve" }
 
-declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>)
-declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>)
-declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>)
-declare half @llvm.experimental.vector.reduce.fmax.v32f16(<32 x half>)
-declare half @llvm.experimental.vector.reduce.fmax.v64f16(<64 x half>)
-declare half @llvm.experimental.vector.reduce.fmax.v128f16(<128 x half>)
-
-declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v64f32(<64 x float>)
-
-declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v32f64(<32 x double>)
-
-declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>)
-declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>)
-declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>)
-declare half @llvm.experimental.vector.reduce.fmin.v32f16(<32 x half>)
-declare half @llvm.experimental.vector.reduce.fmin.v64f16(<64 x half>)
-declare half @llvm.experimental.vector.reduce.fmin.v128f16(<128 x half>)
-
-declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v64f32(<64 x float>)
-
-declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v32f64(<32 x double>)
+declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
+declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
+declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)
+declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>)
+declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>)
+declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>)
+
+declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
+declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)
+declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>)
+
+declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
+declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>)
+
+declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
+declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>)
+declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>)
+declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>)
+declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>)
+declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>)
+
+declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
+declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>)
+declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>)
+
+declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
+declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>)

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
index 633fe19efb02..4967f53d1dfe 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
@@ -29,7 +29,7 @@ define i8 @uaddv_v8i8(<8 x i8> %a) #0 {
 ; CHECK-LABEL: uaddv_v8i8:
 ; CHECK: addv b0, v0.8b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
@@ -38,7 +38,7 @@ define i8 @uaddv_v16i8(<16 x i8> %a) #0 {
 ; CHECK-LABEL: uaddv_v16i8:
 ; CHECK: addv b0, v0.16b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
@@ -50,7 +50,7 @@ define i8 @uaddv_v32i8(<32 x i8>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <32 x i8>, <32 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
@@ -72,7 +72,7 @@ define i8 @uaddv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
@@ -84,7 +84,7 @@ define i8 @uaddv_v128i8(<128 x i8>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <128 x i8>, <128 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
@@ -96,7 +96,7 @@ define i8 @uaddv_v256i8(<256 x i8>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <256 x i8>, <256 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.add.v256i8(<256 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
@@ -105,7 +105,7 @@ define i16 @uaddv_v4i16(<4 x i16> %a) #0 {
 ; CHECK-LABEL: uaddv_v4i16:
 ; CHECK: addv h0, v0.4h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
@@ -114,7 +114,7 @@ define i16 @uaddv_v8i16(<8 x i16> %a) #0 {
 ; CHECK-LABEL: uaddv_v8i16:
 ; CHECK: addv h0, v0.8h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
@@ -126,7 +126,7 @@ define i16 @uaddv_v16i16(<16 x i16>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <16 x i16>, <16 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
@@ -148,7 +148,7 @@ define i16 @uaddv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
@@ -160,7 +160,7 @@ define i16 @uaddv_v64i16(<64 x i16>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <64 x i16>, <64 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
@@ -172,7 +172,7 @@ define i16 @uaddv_v128i16(<128 x i16>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <128 x i16>, <128 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.add.v128i16(<128 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
@@ -181,7 +181,7 @@ define i32 @uaddv_v2i32(<2 x i32> %a) #0 {
 ; CHECK-LABEL: uaddv_v2i32:
 ; CHECK: addp v0.2s, v0.2s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
@@ -190,7 +190,7 @@ define i32 @uaddv_v4i32(<4 x i32> %a) #0 {
 ; CHECK-LABEL: uaddv_v4i32:
 ; CHECK: addv s0, v0.4s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
@@ -202,7 +202,7 @@ define i32 @uaddv_v8i32(<8 x i32>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <8 x i32>, <8 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
@@ -224,7 +224,7 @@ define i32 @uaddv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
@@ -236,7 +236,7 @@ define i32 @uaddv_v32i32(<32 x i32>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <32 x i32>, <32 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
@@ -248,7 +248,7 @@ define i32 @uaddv_v64i32(<64 x i32>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <64 x i32>, <64 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.add.v64i32(<64 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
@@ -257,7 +257,7 @@ define i64 @uaddv_v1i64(<1 x i64> %a) #0 {
 ; CHECK-LABEL: uaddv_v1i64:
 ; CHECK: fmov x0, d0
 ; CHECK: ret
-  %res = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
@@ -266,7 +266,7 @@ define i64 @uaddv_v2i64(<2 x i64> %a) #0 {
 ; CHECK-LABEL: uaddv_v2i64:
 ; CHECK: addp d0, v0.2d
 ; CHECK: ret
-  %res = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
@@ -278,7 +278,7 @@ define i64 @uaddv_v4i64(<4 x i64>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <4 x i64>, <4 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
@@ -300,7 +300,7 @@ define i64 @uaddv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
@@ -312,7 +312,7 @@ define i64 @uaddv_v16i64(<16 x i64>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <16 x i64>, <16 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
@@ -324,7 +324,7 @@ define i64 @uaddv_v32i64(<32 x i64>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <32 x i64>, <32 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.add.v32i64(<32 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op)
   ret i64 %res
 }
 
@@ -337,7 +337,7 @@ define i8 @smaxv_v8i8(<8 x i8> %a) #0 {
 ; CHECK-LABEL: smaxv_v8i8:
 ; CHECK: smaxv b0, v0.8b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
@@ -346,7 +346,7 @@ define i8 @smaxv_v16i8(<16 x i8> %a) #0 {
 ; CHECK-LABEL: smaxv_v16i8:
 ; CHECK: smaxv b0, v0.16b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
@@ -358,7 +358,7 @@ define i8 @smaxv_v32i8(<32 x i8>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <32 x i8>, <32 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
@@ -380,7 +380,7 @@ define i8 @smaxv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
@@ -392,7 +392,7 @@ define i8 @smaxv_v128i8(<128 x i8>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <128 x i8>, <128 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
@@ -404,7 +404,7 @@ define i8 @smaxv_v256i8(<256 x i8>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <256 x i8>, <256 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.smax.v256i8(<256 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
@@ -413,7 +413,7 @@ define i16 @smaxv_v4i16(<4 x i16> %a) #0 {
 ; CHECK-LABEL: smaxv_v4i16:
 ; CHECK: smaxv h0, v0.4h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
@@ -422,7 +422,7 @@ define i16 @smaxv_v8i16(<8 x i16> %a) #0 {
 ; CHECK-LABEL: smaxv_v8i16:
 ; CHECK: smaxv h0, v0.8h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
@@ -434,7 +434,7 @@ define i16 @smaxv_v16i16(<16 x i16>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <16 x i16>, <16 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
@@ -456,7 +456,7 @@ define i16 @smaxv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
@@ -468,7 +468,7 @@ define i16 @smaxv_v64i16(<64 x i16>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <64 x i16>, <64 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
@@ -480,7 +480,7 @@ define i16 @smaxv_v128i16(<128 x i16>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <128 x i16>, <128 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.smax.v128i16(<128 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
@@ -489,7 +489,7 @@ define i32 @smaxv_v2i32(<2 x i32> %a) #0 {
 ; CHECK-LABEL: smaxv_v2i32:
 ; CHECK: smaxp v0.2s, v0.2s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
@@ -498,7 +498,7 @@ define i32 @smaxv_v4i32(<4 x i32> %a) #0 {
 ; CHECK-LABEL: smaxv_v4i32:
 ; CHECK: smaxv s0, v0.4s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
@@ -510,7 +510,7 @@ define i32 @smaxv_v8i32(<8 x i32>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <8 x i32>, <8 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
@@ -532,7 +532,7 @@ define i32 @smaxv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
@@ -544,7 +544,7 @@ define i32 @smaxv_v32i32(<32 x i32>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <32 x i32>, <32 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
@@ -556,7 +556,7 @@ define i32 @smaxv_v64i32(<64 x i32>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <64 x i32>, <64 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.smax.v64i32(<64 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
@@ -565,7 +565,7 @@ define i64 @smaxv_v1i64(<1 x i64> %a) #0 {
 ; CHECK-LABEL: smaxv_v1i64:
 ; CHECK: fmov x0, d0
 ; CHECK: ret
-  %res = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
@@ -576,7 +576,7 @@ define i64 @smaxv_v2i64(<2 x i64> %a) #0 {
 ; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
 ; CHECK-NEXT: fmov x0, [[REDUCE]]
 ; CHECK-NEXT: ret
-  %res = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
@@ -588,7 +588,7 @@ define i64 @smaxv_v4i64(<4 x i64>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <4 x i64>, <4 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
@@ -610,7 +610,7 @@ define i64 @smaxv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
@@ -622,7 +622,7 @@ define i64 @smaxv_v16i64(<16 x i64>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <16 x i64>, <16 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
@@ -634,7 +634,7 @@ define i64 @smaxv_v32i64(<32 x i64>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <32 x i64>, <32 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.smax.v32i64(<32 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op)
   ret i64 %res
 }
 
@@ -647,7 +647,7 @@ define i8 @sminv_v8i8(<8 x i8> %a) #0 {
 ; CHECK-LABEL: sminv_v8i8:
 ; CHECK: sminv b0, v0.8b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
@@ -656,7 +656,7 @@ define i8 @sminv_v16i8(<16 x i8> %a) #0 {
 ; CHECK-LABEL: sminv_v16i8:
 ; CHECK: sminv b0, v0.16b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
@@ -668,7 +668,7 @@ define i8 @sminv_v32i8(<32 x i8>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <32 x i8>, <32 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
@@ -690,7 +690,7 @@ define i8 @sminv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
@@ -702,7 +702,7 @@ define i8 @sminv_v128i8(<128 x i8>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <128 x i8>, <128 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
@@ -714,7 +714,7 @@ define i8 @sminv_v256i8(<256 x i8>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <256 x i8>, <256 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.smin.v256i8(<256 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
@@ -723,7 +723,7 @@ define i16 @sminv_v4i16(<4 x i16> %a) #0 {
 ; CHECK-LABEL: sminv_v4i16:
 ; CHECK: sminv h0, v0.4h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
@@ -732,7 +732,7 @@ define i16 @sminv_v8i16(<8 x i16> %a) #0 {
 ; CHECK-LABEL: sminv_v8i16:
 ; CHECK: sminv h0, v0.8h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
@@ -744,7 +744,7 @@ define i16 @sminv_v16i16(<16 x i16>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <16 x i16>, <16 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
@@ -766,7 +766,7 @@ define i16 @sminv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
@@ -778,7 +778,7 @@ define i16 @sminv_v64i16(<64 x i16>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <64 x i16>, <64 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
@@ -790,7 +790,7 @@ define i16 @sminv_v128i16(<128 x i16>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <128 x i16>, <128 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.smin.v128i16(<128 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
@@ -799,7 +799,7 @@ define i32 @sminv_v2i32(<2 x i32> %a) #0 {
 ; CHECK-LABEL: sminv_v2i32:
 ; CHECK: minp v0.2s, v0.2s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
@@ -808,7 +808,7 @@ define i32 @sminv_v4i32(<4 x i32> %a) #0 {
 ; CHECK-LABEL: sminv_v4i32:
 ; CHECK: sminv s0, v0.4s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
@@ -820,7 +820,7 @@ define i32 @sminv_v8i32(<8 x i32>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <8 x i32>, <8 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
@@ -842,7 +842,7 @@ define i32 @sminv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
@@ -854,7 +854,7 @@ define i32 @sminv_v32i32(<32 x i32>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <32 x i32>, <32 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
@@ -866,7 +866,7 @@ define i32 @sminv_v64i32(<64 x i32>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <64 x i32>, <64 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.smin.v64i32(<64 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
@@ -875,7 +875,7 @@ define i64 @sminv_v1i64(<1 x i64> %a) #0 {
 ; CHECK-LABEL: sminv_v1i64:
 ; CHECK: fmov x0, d0
 ; CHECK: ret
-  %res = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
@@ -886,7 +886,7 @@ define i64 @sminv_v2i64(<2 x i64> %a) #0 {
 ; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
 ; CHECK-NEXT: fmov x0, [[REDUCE]]
 ; CHECK-NEXT: ret
-  %res = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
@@ -898,7 +898,7 @@ define i64 @sminv_v4i64(<4 x i64>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <4 x i64>, <4 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
@@ -920,7 +920,7 @@ define i64 @sminv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
@@ -932,7 +932,7 @@ define i64 @sminv_v16i64(<16 x i64>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <16 x i64>, <16 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
@@ -944,7 +944,7 @@ define i64 @sminv_v32i64(<32 x i64>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <32 x i64>, <32 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.smin.v32i64(<32 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op)
   ret i64 %res
 }
 
@@ -957,7 +957,7 @@ define i8 @umaxv_v8i8(<8 x i8> %a) #0 {
 ; CHECK-LABEL: umaxv_v8i8:
 ; CHECK: umaxv b0, v0.8b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
@@ -966,7 +966,7 @@ define i8 @umaxv_v16i8(<16 x i8> %a) #0 {
 ; CHECK-LABEL: umaxv_v16i8:
 ; CHECK: umaxv b0, v0.16b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
@@ -978,7 +978,7 @@ define i8 @umaxv_v32i8(<32 x i8>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <32 x i8>, <32 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
@@ -1000,7 +1000,7 @@ define i8 @umaxv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
@@ -1012,7 +1012,7 @@ define i8 @umaxv_v128i8(<128 x i8>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <128 x i8>, <128 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
@@ -1024,7 +1024,7 @@ define i8 @umaxv_v256i8(<256 x i8>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <256 x i8>, <256 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.umax.v256i8(<256 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
@@ -1033,7 +1033,7 @@ define i16 @umaxv_v4i16(<4 x i16> %a) #0 {
 ; CHECK-LABEL: umaxv_v4i16:
 ; CHECK: umaxv h0, v0.4h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
@@ -1042,7 +1042,7 @@ define i16 @umaxv_v8i16(<8 x i16> %a) #0 {
 ; CHECK-LABEL: umaxv_v8i16:
 ; CHECK: umaxv h0, v0.8h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
@@ -1054,7 +1054,7 @@ define i16 @umaxv_v16i16(<16 x i16>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <16 x i16>, <16 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
@@ -1076,7 +1076,7 @@ define i16 @umaxv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
@@ -1088,7 +1088,7 @@ define i16 @umaxv_v64i16(<64 x i16>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <64 x i16>, <64 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
@@ -1100,7 +1100,7 @@ define i16 @umaxv_v128i16(<128 x i16>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <128 x i16>, <128 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.umax.v128i16(<128 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
@@ -1109,7 +1109,7 @@ define i32 @umaxv_v2i32(<2 x i32> %a) #0 {
 ; CHECK-LABEL: umaxv_v2i32:
 ; CHECK: umaxp v0.2s, v0.2s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
@@ -1118,7 +1118,7 @@ define i32 @umaxv_v4i32(<4 x i32> %a) #0 {
 ; CHECK-LABEL: umaxv_v4i32:
 ; CHECK: umaxv s0, v0.4s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
@@ -1130,7 +1130,7 @@ define i32 @umaxv_v8i32(<8 x i32>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <8 x i32>, <8 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
@@ -1152,7 +1152,7 @@ define i32 @umaxv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
@@ -1164,7 +1164,7 @@ define i32 @umaxv_v32i32(<32 x i32>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <32 x i32>, <32 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
@@ -1176,7 +1176,7 @@ define i32 @umaxv_v64i32(<64 x i32>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <64 x i32>, <64 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.umax.v64i32(<64 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
@@ -1185,7 +1185,7 @@ define i64 @umaxv_v1i64(<1 x i64> %a) #0 {
 ; CHECK-LABEL: umaxv_v1i64:
 ; CHECK: fmov x0, d0
 ; CHECK: ret
-  %res = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
@@ -1196,7 +1196,7 @@ define i64 @umaxv_v2i64(<2 x i64> %a) #0 {
 ; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
 ; CHECK-NEXT: fmov x0, [[REDUCE]]
 ; CHECK-NEXT: ret
-  %res = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
@@ -1208,7 +1208,7 @@ define i64 @umaxv_v4i64(<4 x i64>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <4 x i64>, <4 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
@@ -1230,7 +1230,7 @@ define i64 @umaxv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
@@ -1242,7 +1242,7 @@ define i64 @umaxv_v16i64(<16 x i64>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <16 x i64>, <16 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
@@ -1254,7 +1254,7 @@ define i64 @umaxv_v32i64(<32 x i64>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <32 x i64>, <32 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.umax.v32i64(<32 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op)
   ret i64 %res
 }
 
@@ -1267,7 +1267,7 @@ define i8 @uminv_v8i8(<8 x i8> %a) #0 {
 ; CHECK-LABEL: uminv_v8i8:
 ; CHECK: uminv b0, v0.8b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
 
@@ -1276,7 +1276,7 @@ define i8 @uminv_v16i8(<16 x i8> %a) #0 {
 ; CHECK-LABEL: uminv_v16i8:
 ; CHECK: uminv b0, v0.16b
 ; CHECK: ret
-  %res = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %a)
+  %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
 
@@ -1288,7 +1288,7 @@ define i8 @uminv_v32i8(<32 x i8>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <32 x i8>, <32 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
   ret i8 %res
 }
 
@@ -1310,7 +1310,7 @@ define i8 @uminv_v64i8(<64 x i8>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <64 x i8>, <64 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %op)
   ret i8 %res
 }
 
@@ -1322,7 +1322,7 @@ define i8 @uminv_v128i8(<128 x i8>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <128 x i8>, <128 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op)
   ret i8 %res
 }
 
@@ -1334,7 +1334,7 @@ define i8 @uminv_v256i8(<256 x i8>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <256 x i8>, <256 x i8>* %a
-  %res = call i8 @llvm.experimental.vector.reduce.umin.v256i8(<256 x i8> %op)
+  %res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op)
   ret i8 %res
 }
 
@@ -1343,7 +1343,7 @@ define i16 @uminv_v4i16(<4 x i16> %a) #0 {
 ; CHECK-LABEL: uminv_v4i16:
 ; CHECK: uminv h0, v0.4h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
 
@@ -1352,7 +1352,7 @@ define i16 @uminv_v8i16(<8 x i16> %a) #0 {
 ; CHECK-LABEL: uminv_v8i16:
 ; CHECK: uminv h0, v0.8h
 ; CHECK: ret
-  %res = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %a)
+  %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
 
@@ -1364,7 +1364,7 @@ define i16 @uminv_v16i16(<16 x i16>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <16 x i16>, <16 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
   ret i16 %res
 }
 
@@ -1386,7 +1386,7 @@ define i16 @uminv_v32i16(<32 x i16>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <32 x i16>, <32 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op)
   ret i16 %res
 }
 
@@ -1398,7 +1398,7 @@ define i16 @uminv_v64i16(<64 x i16>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <64 x i16>, <64 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op)
   ret i16 %res
 }
 
@@ -1410,7 +1410,7 @@ define i16 @uminv_v128i16(<128 x i16>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <128 x i16>, <128 x i16>* %a
-  %res = call i16 @llvm.experimental.vector.reduce.umin.v128i16(<128 x i16> %op)
+  %res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op)
   ret i16 %res
 }
 
@@ -1419,7 +1419,7 @@ define i32 @uminv_v2i32(<2 x i32> %a) #0 {
 ; CHECK-LABEL: uminv_v2i32:
 ; CHECK: minp v0.2s, v0.2s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
 
@@ -1428,7 +1428,7 @@ define i32 @uminv_v4i32(<4 x i32> %a) #0 {
 ; CHECK-LABEL: uminv_v4i32:
 ; CHECK: uminv s0, v0.4s
 ; CHECK: ret
-  %res = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a)
+  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
 
@@ -1440,7 +1440,7 @@ define i32 @uminv_v8i32(<8 x i32>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <8 x i32>, <8 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
   ret i32 %res
 }
 
@@ -1462,7 +1462,7 @@ define i32 @uminv_v16i32(<16 x i32>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <16 x i32>, <16 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op)
   ret i32 %res
 }
 
@@ -1474,7 +1474,7 @@ define i32 @uminv_v32i32(<32 x i32>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <32 x i32>, <32 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %op)
   ret i32 %res
 }
 
@@ -1486,7 +1486,7 @@ define i32 @uminv_v64i32(<64 x i32>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <64 x i32>, <64 x i32>* %a
-  %res = call i32 @llvm.experimental.vector.reduce.umin.v64i32(<64 x i32> %op)
+  %res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op)
   ret i32 %res
 }
 
@@ -1495,7 +1495,7 @@ define i64 @uminv_v1i64(<1 x i64> %a) #0 {
 ; CHECK-LABEL: uminv_v1i64:
 ; CHECK: fmov x0, d0
 ; CHECK: ret
-  %res = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a)
   ret i64 %res
 }
 
@@ -1506,7 +1506,7 @@ define i64 @uminv_v2i64(<2 x i64> %a) #0 {
 ; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
 ; CHECK-NEXT: fmov x0, [[REDUCE]]
 ; CHECK-NEXT: ret
-  %res = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %a)
+  %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
 
@@ -1518,7 +1518,7 @@ define i64 @uminv_v4i64(<4 x i64>* %a) #0 {
 ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_256-NEXT: ret
   %op = load <4 x i64>, <4 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
   ret i64 %res
 }
 
@@ -1540,7 +1540,7 @@ define i64 @uminv_v8i64(<8 x i64>* %a) #0 {
 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_EQ_256-NEXT: ret
   %op = load <8 x i64>, <8 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op)
   ret i64 %res
 }
 
@@ -1552,7 +1552,7 @@ define i64 @uminv_v16i64(<16 x i64>* %a) #0 {
 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_1024-NEXT: ret
   %op = load <16 x i64>, <16 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op)
   ret i64 %res
 }
 
@@ -1564,148 +1564,148 @@ define i64 @uminv_v32i64(<32 x i64>* %a) #0 {
 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
 ; VBITS_GE_2048-NEXT: ret
   %op = load <32 x i64>, <32 x i64>* %a
-  %res = call i64 @llvm.experimental.vector.reduce.umin.v32i64(<32 x i64> %op)
+  %res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op)
   ret i64 %res
 }
 
 attributes #0 = { "target-features"="+sve" }
 
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v256i8(<256 x i8>)
-
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v128i16(<128 x i16>)
-
-declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v64i32(<64 x i32>)
-
-declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v32i64(<32 x i64>)
-
-declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v256i8(<256 x i8>)
-
-declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v128i16(<128 x i16>)
-
-declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v64i32(<64 x i32>)
-
-declare i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v32i64(<32 x i64>)
-
-declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v256i8(<256 x i8>)
-
-declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v128i16(<128 x i16>)
-
-declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v64i32(<64 x i32>)
-
-declare i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v32i64(<32 x i64>)
-
-declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v256i8(<256 x i8>)
-
-declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v128i16(<128 x i16>)
-
-declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v64i32(<64 x i32>)
-
-declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v32i64(<32 x i64>)
-
-declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v256i8(<256 x i8>)
-
-declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v128i16(<128 x i16>)
-
-declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v64i32(<64 x i32>)
-
-declare i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v32i64(<32 x i64>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)
+
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)
+
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
+
+declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)
+
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)
+
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)
+
+declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)
+
+declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)
+
+declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>)
+
+declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>)
+
+declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>)
+
+declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>)
+
+declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>)
+
+declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>)
+
+declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)
+
+declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)
+
+declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>)
+
+declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>)
+
+declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>)
+
+declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>)

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll
index 2f899878363a..36ec218e4e20 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll
@@ -1,28 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare i1 @llvm.experimental.vector.reduce.add.v1i1(<1 x i1> %a)
-declare i8 @llvm.experimental.vector.reduce.add.v1i8(<1 x i8> %a)
-declare i16 @llvm.experimental.vector.reduce.add.v1i16(<1 x i16> %a)
-declare i24 @llvm.experimental.vector.reduce.add.v1i24(<1 x i24> %a)
-declare i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a)
-declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> %a)
-declare i128 @llvm.experimental.vector.reduce.add.v1i128(<1 x i128> %a)
-
-declare i8 @llvm.experimental.vector.reduce.add.v3i8(<3 x i8> %a)
-declare i8 @llvm.experimental.vector.reduce.add.v9i8(<9 x i8> %a)
-declare i32 @llvm.experimental.vector.reduce.add.v3i32(<3 x i32> %a)
-declare i1 @llvm.experimental.vector.reduce.add.v4i1(<4 x i1> %a)
-declare i24 @llvm.experimental.vector.reduce.add.v4i24(<4 x i24> %a)
-declare i128 @llvm.experimental.vector.reduce.add.v2i128(<2 x i128> %a)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a)
+declare i1 @llvm.vector.reduce.add.v1i1(<1 x i1> %a)
+declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a)
+declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a)
+declare i24 @llvm.vector.reduce.add.v1i24(<1 x i24> %a)
+declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a)
+declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
+declare i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a)
+
+declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %a)
+declare i8 @llvm.vector.reduce.add.v9i8(<9 x i8> %a)
+declare i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %a)
+declare i1 @llvm.vector.reduce.add.v4i1(<4 x i1> %a)
+declare i24 @llvm.vector.reduce.add.v4i24(<4 x i24> %a)
+declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a)
 
 define i1 @test_v1i1(<1 x i1> %a) nounwind {
 ; CHECK-LABEL: test_v1i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w0, w0, #0x1
 ; CHECK-NEXT:    ret
-  %b = call i1 @llvm.experimental.vector.reduce.add.v1i1(<1 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> %a)
   ret i1 %b
 }
 
@@ -32,7 +32,7 @@ define i8 @test_v1i8(<1 x i8> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    umov w0, v0.b[0]
 ; CHECK-NEXT:    ret
-  %b = call i8 @llvm.experimental.vector.reduce.add.v1i8(<1 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a)
   ret i8 %b
 }
 
@@ -42,7 +42,7 @@ define i16 @test_v1i16(<1 x i16> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    umov w0, v0.h[0]
 ; CHECK-NEXT:    ret
-  %b = call i16 @llvm.experimental.vector.reduce.add.v1i16(<1 x i16> %a)
+  %b = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a)
   ret i16 %b
 }
 
@@ -50,7 +50,7 @@ define i24 @test_v1i24(<1 x i24> %a) nounwind {
 ; CHECK-LABEL: test_v1i24:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call i24 @llvm.experimental.vector.reduce.add.v1i24(<1 x i24> %a)
+  %b = call i24 @llvm.vector.reduce.add.v1i24(<1 x i24> %a)
   ret i24 %b
 }
 
@@ -60,7 +60,7 @@ define i32 @test_v1i32(<1 x i32> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a)
+  %b = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a)
   ret i32 %b
 }
 
@@ -70,7 +70,7 @@ define i64 @test_v1i64(<1 x i64> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-  %b = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> %a)
+  %b = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
   ret i64 %b
 }
 
@@ -78,7 +78,7 @@ define i128 @test_v1i128(<1 x i128> %a) nounwind {
 ; CHECK-LABEL: test_v1i128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call i128 @llvm.experimental.vector.reduce.add.v1i128(<1 x i128> %a)
+  %b = call i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a)
   ret i128 %b
 }
 
@@ -92,7 +92,7 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind {
 ; CHECK-NEXT:    addv h0, v0.4h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i8 @llvm.experimental.vector.reduce.add.v3i8(<3 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %a)
   ret i8 %b
 }
 
@@ -109,7 +109,7 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
 ; CHECK-NEXT:    addv b0, v0.16b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i8 @llvm.experimental.vector.reduce.add.v9i8(<9 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.add.v9i8(<9 x i8> %a)
   ret i8 %b
 }
 
@@ -120,7 +120,7 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i32 @llvm.experimental.vector.reduce.add.v3i32(<3 x i32> %a)
+  %b = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %a)
   ret i32 %b
 }
 
@@ -131,7 +131,7 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
-  %b = call i1 @llvm.experimental.vector.reduce.add.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -141,7 +141,7 @@ define i24 @test_v4i24(<4 x i24> %a) nounwind {
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i24 @llvm.experimental.vector.reduce.add.v4i24(<4 x i24> %a)
+  %b = call i24 @llvm.vector.reduce.add.v4i24(<4 x i24> %a)
   ret i24 %b
 }
 
@@ -151,7 +151,7 @@ define i128 @test_v2i128(<2 x i128> %a) nounwind {
 ; CHECK-NEXT:    adds x0, x0, x2
 ; CHECK-NEXT:    adcs x1, x1, x3
 ; CHECK-NEXT:    ret
-  %b = call i128 @llvm.experimental.vector.reduce.add.v2i128(<2 x i128> %a)
+  %b = call i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a)
   ret i128 %b
 }
 
@@ -164,6 +164,6 @@ define i32 @test_v16i32(<16 x i32> %a) nounwind {
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a)
+  %b = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a)
   ret i32 %b
 }

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
index 43b6f5290d1a..99d55b33da35 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
@@ -1,28 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %a)
-declare i8 @llvm.experimental.vector.reduce.and.v1i8(<1 x i8> %a)
-declare i16 @llvm.experimental.vector.reduce.and.v1i16(<1 x i16> %a)
-declare i24 @llvm.experimental.vector.reduce.and.v1i24(<1 x i24> %a)
-declare i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a)
-declare i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %a)
-declare i128 @llvm.experimental.vector.reduce.and.v1i128(<1 x i128> %a)
-
-declare i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> %a)
-declare i8 @llvm.experimental.vector.reduce.and.v9i8(<9 x i8> %a)
-declare i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> %a)
-declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
-declare i24 @llvm.experimental.vector.reduce.and.v4i24(<4 x i24> %a)
-declare i128 @llvm.experimental.vector.reduce.and.v2i128(<2 x i128> %a)
-declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a)
+declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %a)
+declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %a)
+declare i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %a)
+declare i24 @llvm.vector.reduce.and.v1i24(<1 x i24> %a)
+declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a)
+declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
+declare i128 @llvm.vector.reduce.and.v1i128(<1 x i128> %a)
+
+declare i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %a)
+declare i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a)
+declare i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a)
+declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
+declare i24 @llvm.vector.reduce.and.v4i24(<4 x i24> %a)
+declare i128 @llvm.vector.reduce.and.v2i128(<2 x i128> %a)
+declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a)
 
 define i1 @test_v1i1(<1 x i1> %a) nounwind {
 ; CHECK-LABEL: test_v1i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w0, w0, #0x1
 ; CHECK-NEXT:    ret
-  %b = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %a)
   ret i1 %b
 }
 
@@ -32,7 +32,7 @@ define i8 @test_v1i8(<1 x i8> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    umov w0, v0.b[0]
 ; CHECK-NEXT:    ret
-  %b = call i8 @llvm.experimental.vector.reduce.and.v1i8(<1 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %a)
   ret i8 %b
 }
 
@@ -42,7 +42,7 @@ define i16 @test_v1i16(<1 x i16> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    umov w0, v0.h[0]
 ; CHECK-NEXT:    ret
-  %b = call i16 @llvm.experimental.vector.reduce.and.v1i16(<1 x i16> %a)
+  %b = call i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %a)
   ret i16 %b
 }
 
@@ -50,7 +50,7 @@ define i24 @test_v1i24(<1 x i24> %a) nounwind {
 ; CHECK-LABEL: test_v1i24:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call i24 @llvm.experimental.vector.reduce.and.v1i24(<1 x i24> %a)
+  %b = call i24 @llvm.vector.reduce.and.v1i24(<1 x i24> %a)
   ret i24 %b
 }
 
@@ -60,7 +60,7 @@ define i32 @test_v1i32(<1 x i32> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a)
+  %b = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a)
   ret i32 %b
 }
 
@@ -70,7 +70,7 @@ define i64 @test_v1i64(<1 x i64> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-  %b = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %a)
+  %b = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
   ret i64 %b
 }
 
@@ -78,7 +78,7 @@ define i128 @test_v1i128(<1 x i128> %a) nounwind {
 ; CHECK-LABEL: test_v1i128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call i128 @llvm.experimental.vector.reduce.and.v1i128(<1 x i128> %a)
+  %b = call i128 @llvm.vector.reduce.and.v1i128(<1 x i128> %a)
   ret i128 %b
 }
 
@@ -89,7 +89,7 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind {
 ; CHECK-NEXT:    and w8, w8, w2
 ; CHECK-NEXT:    and w0, w8, #0xff
 ; CHECK-NEXT:    ret
-  %b = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %a)
   ret i8 %b
 }
 
@@ -120,7 +120,7 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
 ; CHECK-NEXT:    umov w9, v0.b[7]
 ; CHECK-NEXT:    and w0, w8, w9
 ; CHECK-NEXT:    ret
-  %b = call i8 @llvm.experimental.vector.reduce.and.v9i8(<9 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a)
   ret i8 %b
 }
 
@@ -133,7 +133,7 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
 ; CHECK-NEXT:    fmov w9, s1
 ; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
-  %b = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> %a)
+  %b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a)
   ret i32 %b
 }
 
@@ -150,7 +150,7 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind {
 ; CHECK-NEXT:    and w8, w9, w8
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
-  %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -163,7 +163,7 @@ define i24 @test_v4i24(<4 x i24> %a) nounwind {
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
-  %b = call i24 @llvm.experimental.vector.reduce.and.v4i24(<4 x i24> %a)
+  %b = call i24 @llvm.vector.reduce.and.v4i24(<4 x i24> %a)
   ret i24 %b
 }
 
@@ -173,7 +173,7 @@ define i128 @test_v2i128(<2 x i128> %a) nounwind {
 ; CHECK-NEXT:    and x0, x0, x2
 ; CHECK-NEXT:    and x1, x1, x3
 ; CHECK-NEXT:    ret
-  %b = call i128 @llvm.experimental.vector.reduce.and.v2i128(<2 x i128> %a)
+  %b = call i128 @llvm.vector.reduce.and.v2i128(<2 x i128> %a)
   ret i128 %b
 }
 
@@ -189,6 +189,6 @@ define i32 @test_v16i32(<16 x i32> %a) nounwind {
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
-  %b = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a)
+  %b = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a)
   ret i32 %b
 }

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index e404159c7630..1b5692c01332 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -1,19 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a)
+declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %a)
+declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a)
+declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
+declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a)
+declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a)
+declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a)
 
-declare i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a)
-declare i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a)
+declare i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %a)
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a)
+declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a)
+declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
+declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
+declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)
 
 define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-LABEL: reduce_and_v1:
@@ -24,7 +24,7 @@ define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, lt
 ; CHECK-NEXT:    ret
   %x = icmp slt <1 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -41,7 +41,7 @@ define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <2 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -58,7 +58,7 @@ define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <4 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -73,7 +73,7 @@ define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <8 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -88,7 +88,7 @@ define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <16 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -105,7 +105,7 @@ define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <32 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -119,7 +119,7 @@ define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, lt
 ; CHECK-NEXT:    ret
   %x = icmp slt <1 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -136,7 +136,7 @@ define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <2 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -153,7 +153,7 @@ define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <4 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -168,7 +168,7 @@ define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <8 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -183,7 +183,7 @@ define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <16 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
@@ -200,7 +200,7 @@ define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <32 x i8> %a0, zeroinitializer
-  %y = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %x)
+  %y = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %x)
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
index 5d6f2e40d4d5..12c166490e87 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
@@ -3,14 +3,14 @@
 
 ; Same as vecreduce-fadd-legalization.ll, but without fmf.
 
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>)
+declare half @llvm.vector.reduce.fadd.f16.v1f16(half, <1 x half>)
+declare float @llvm.vector.reduce.fadd.f32.v1f32(float, <1 x float>)
+declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>)
+declare fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>)
 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v3f32(float, <3 x float>)
+declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>)
+declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
 
 define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-LABEL: test_v1f16:
@@ -20,7 +20,7 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    fcvt h0, s0
 ; CHECK-NEXT:    ret
-  %b = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a)
+  %b = call half @llvm.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a)
   ret half %b
 }
 
@@ -31,7 +31,7 @@ define float @test_v1f32(<1 x float> %a) nounwind {
 ; CHECK-NEXT:    fmov s1, wzr
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
-  %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a)
+  %b = call float @llvm.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a)
   ret float %b
 }
 
@@ -41,7 +41,7 @@ define double @test_v1f64(<1 x double> %a) nounwind {
 ; CHECK-NEXT:    fmov d1, xzr
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
-  %b = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a)
+  %b = call double @llvm.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a)
   ret double %b
 }
 
@@ -54,7 +54,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
   ret fp128 %b
 }
 
@@ -68,7 +68,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-NEXT:    mov s0, v0.s[2]
 ; CHECK-NEXT:    fadd s0, s1, s0
 ; CHECK-NEXT:    ret
-  %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a)
+  %b = call float @llvm.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a)
   ret float %b
 }
 
@@ -86,7 +86,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #32 // =32
 ; CHECK-NEXT:    ret
-  %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
   ret fp128 %b
 }
 
@@ -123,6 +123,6 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:    mov s1, v3.s[3]
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
-  %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a)
+  %b = call float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a)
   ret float %b
 }

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
index 11686148421d..e098c08ce46c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
@@ -1,20 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>)
+declare half @llvm.vector.reduce.fadd.f16.v1f16(half, <1 x half>)
+declare float @llvm.vector.reduce.fadd.f32.v1f32(float, <1 x float>)
+declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>)
+declare fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>)
 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v3f32(float, <3 x float>)
+declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>)
+declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
 
 define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-LABEL: test_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call fast nnan half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a)
+  %b = call fast nnan half @llvm.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a)
   ret half %b
 }
 
@@ -24,7 +24,7 @@ define float @test_v1f32(<1 x float> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    ret
-  %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a)
+  %b = call fast nnan float @llvm.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a)
   ret float %b
 }
 
@@ -32,7 +32,7 @@ define double @test_v1f64(<1 x double> %a) nounwind {
 ; CHECK-LABEL: test_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call fast nnan double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a)
+  %b = call fast nnan double @llvm.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a)
   ret double %b
 }
 
@@ -40,7 +40,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v1f128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
+  %b = call fast nnan fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
   ret fp128 %b
 }
 
@@ -53,7 +53,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    ret
-  %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a)
+  %b = call fast nnan float @llvm.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a)
   ret float %b
 }
 
@@ -64,7 +64,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
+  %b = call fast nnan fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
   ret fp128 %b
 }
 
@@ -78,6 +78,6 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    ret
-  %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a)
+  %b = call fast nnan float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a)
   ret float %b
 }

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index 90367377fb4a..7c5e50a6ef11 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -14,7 +14,7 @@ define float @add_HalfS(<2 x float> %bin.rdx)  {
 ; CHECKNOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECKNOFP16-NEXT:    faddp s0, v0.2s
 ; CHECKNOFP16-NEXT:    ret
-  %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx)
+  %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx)
   ret float %r
 }
 
@@ -48,7 +48,7 @@ define half @add_HalfH(<4 x half> %bin.rdx)  {
 ; CHECKNOFP16-NEXT:    fadd s0, s0, s1
 ; CHECKNOFP16-NEXT:    fcvt h0, s0
 ; CHECKNOFP16-NEXT:    ret
-  %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx)
+  %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx)
   ret half %r
 }
 
@@ -103,7 +103,7 @@ define half @add_H(<8 x half> %bin.rdx)  {
 ; CHECKNOFP16-NEXT:    fadd s0, s0, s1
 ; CHECKNOFP16-NEXT:    fcvt h0, s0
 ; CHECKNOFP16-NEXT:    ret
-  %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx)
+  %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx)
   ret half %r
 }
 
@@ -121,7 +121,7 @@ define float @add_S(<4 x float> %bin.rdx)  {
 ; CHECKNOFP16-NEXT:    fadd v0.2s, v0.2s, v1.2s
 ; CHECKNOFP16-NEXT:    faddp s0, v0.2s
 ; CHECKNOFP16-NEXT:    ret
-  %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx)
+  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx)
   ret float %r
 }
 
@@ -135,7 +135,7 @@ define double @add_D(<2 x double> %bin.rdx)  {
 ; CHECKNOFP16:       // %bb.0:
 ; CHECKNOFP16-NEXT:    faddp d0, v0.2d
 ; CHECKNOFP16-NEXT:    ret
-  %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx)
+  %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx)
   ret double %r
 }
 
@@ -229,7 +229,7 @@ define half @add_2H(<16 x half> %bin.rdx)  {
 ; CHECKNOFP16-NEXT:    fadd s0, s1, s0
 ; CHECKNOFP16-NEXT:    fcvt h0, s0
 ; CHECKNOFP16-NEXT:    ret
-  %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx)
+  %r = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx)
   ret half %r
 }
 
@@ -249,7 +249,7 @@ define float @add_2S(<8 x float> %bin.rdx)  {
 ; CHECKNOFP16-NEXT:    fadd v0.2s, v0.2s, v1.2s
 ; CHECKNOFP16-NEXT:    faddp s0, v0.2s
 ; CHECKNOFP16-NEXT:    ret
-  %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx)
+  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx)
   ret float %r
 }
 
@@ -265,16 +265,16 @@ define double @add_2D(<4 x double> %bin.rdx)  {
 ; CHECKNOFP16-NEXT:    fadd v0.2d, v0.2d, v1.2d
 ; CHECKNOFP16-NEXT:    faddp d0, v0.2d
 ; CHECKNOFP16-NEXT:    ret
-  %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx)
+  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx)
   ret double %r
 }
 
 ; Function Attrs: nounwind readnone
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>)
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>)
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
+declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
+declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
+declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
+declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
index 514a43a5e171..8d952a443c71 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
@@ -1,20 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a)
-declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a)
-declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a)
-declare fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a)
+declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a)
+declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
+declare double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
+declare fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a)
 
-declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a)
-declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a)
-declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a)
+declare float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a)
+declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a)
 
 define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-LABEL: test_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a)
+  %b = call half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a)
   ret half %b
 }
 
@@ -24,7 +24,7 @@ define float @test_v1f32(<1 x float> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    ret
-  %b = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a)
+  %b = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
   ret float %b
 }
 
@@ -32,7 +32,7 @@ define double @test_v1f64(<1 x double> %a) nounwind {
 ; CHECK-LABEL: test_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a)
+  %b = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
   ret double %b
 }
 
@@ -40,14 +40,14 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v1f128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a)
   ret fp128 %b
 }
 
 ; TODO: This doesn't work, because ExpandReductions only supports power of two
 ; unordered reductions.
 ;define float @test_v3f32(<3 x float> %a) nounwind {
-;  %b = call float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a)
+;  %b = call float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a)
 ;  ret float %b
 ;}
 
@@ -55,7 +55,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v2f128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    b fmaxl
-  %b = call fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
   ret fp128 %b
 }
 
@@ -67,6 +67,6 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    fmaxnmv s0, v0.4s
 ; CHECK-NEXT:    ret
-  %b = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a)
+  %b = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a)
   ret float %b
 }

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
index 89cee4f0a060..f1ebd8fa85ea 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -1,20 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a)
-declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a)
-declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a)
-declare fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a)
+declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a)
+declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
+declare double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
+declare fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a)
 
-declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a)
-declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a)
-declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a)
+declare float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a)
+declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a)
 
 define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-LABEL: test_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call nnan half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a)
+  %b = call nnan half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a)
   ret half %b
 }
 
@@ -24,7 +24,7 @@ define float @test_v1f32(<1 x float> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    ret
-  %b = call nnan float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a)
+  %b = call nnan float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
   ret float %b
 }
 
@@ -32,7 +32,7 @@ define double @test_v1f64(<1 x double> %a) nounwind {
 ; CHECK-LABEL: test_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call nnan double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a)
+  %b = call nnan double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
   ret double %b
 }
 
@@ -40,7 +40,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v1f128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call nnan fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a)
+  %b = call nnan fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a)
   ret fp128 %b
 }
 
@@ -52,7 +52,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    fmaxnmv s0, v0.4s
 ; CHECK-NEXT:    ret
-  %b = call nnan float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a)
+  %b = call nnan float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a)
   ret float %b
 }
 
@@ -64,7 +64,7 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind {
 ; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    fmaxnmv s0, v0.4s
 ; CHECK-NEXT:    ret
-  %b = call nnan ninf float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a)
+  %b = call nnan ninf float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a)
   ret float %b
 }
 
@@ -72,7 +72,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v2f128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    b fmaxl
-  %b = call nnan fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a)
+  %b = call nnan fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
   ret fp128 %b
 }
 
@@ -84,6 +84,6 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    fmaxnmv s0, v0.4s
 ; CHECK-NEXT:    ret
-  %b = call nnan float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a)
+  %b = call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a)
   ret float %b
 }

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index bb2d6b75bcd2..4129fa80b13e 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -1,20 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.fmin.v1f16(<1 x half> %a)
-declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a)
-declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a)
-declare fp128 @llvm.experimental.vector.reduce.fmin.v1f128(<1 x fp128> %a)
+declare half @llvm.vector.reduce.fmin.v1f16(<1 x half> %a)
+declare float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a)
+declare double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
+declare fp128 @llvm.vector.reduce.fmin.v1f128(<1 x fp128> %a)
 
-declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a)
-declare fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a)
-declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a)
+declare float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a)
+declare fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a)
+declare float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a)
 
 define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-LABEL: test_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call nnan half @llvm.experimental.vector.reduce.fmin.v1f16(<1 x half> %a)
+  %b = call nnan half @llvm.vector.reduce.fmin.v1f16(<1 x half> %a)
   ret half %b
 }
 
@@ -24,7 +24,7 @@ define float @test_v1f32(<1 x float> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    ret
-  %b = call nnan float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a)
+  %b = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a)
   ret float %b
 }
 
@@ -32,7 +32,7 @@ define double @test_v1f64(<1 x double> %a) nounwind {
 ; CHECK-LABEL: test_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call nnan double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a)
+  %b = call nnan double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
   ret double %b
 }
 
@@ -40,7 +40,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v1f128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call nnan fp128 @llvm.experimental.vector.reduce.fmin.v1f128(<1 x fp128> %a)
+  %b = call nnan fp128 @llvm.vector.reduce.fmin.v1f128(<1 x fp128> %a)
   ret fp128 %b
 }
 
@@ -52,7 +52,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    fminnmv s0, v0.4s
 ; CHECK-NEXT:    ret
-  %b = call nnan float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a)
+  %b = call nnan float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a)
   ret float %b
 }
 
@@ -64,7 +64,7 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind {
 ; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    fminnmv s0, v0.4s
 ; CHECK-NEXT:    ret
-  %b = call nnan ninf float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a)
+  %b = call nnan ninf float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a)
   ret float %b
 }
 
@@ -72,7 +72,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v2f128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    b fminl
-  %b = call nnan fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a)
+  %b = call nnan fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a)
   ret fp128 %b
 }
 
@@ -84,6 +84,6 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:    fminnm v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    fminnmv s0, v0.4s
 ; CHECK-NEXT:    ret
-  %b = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a)
+  %b = call nnan float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a)
   ret float %b
 }

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
index 41914ca73afa..be661127f066 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
@@ -3,14 +3,14 @@
 
 ; Same as vecreduce-fmul-legalization.ll, but without fmf.
 
-declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half, <1 x half>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float, <1 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128, <1 x fp128>)
+declare half @llvm.vector.reduce.fmul.f16.v1f16(half, <1 x half>)
+declare float @llvm.vector.reduce.fmul.f32.v1f32(float, <1 x float>)
+declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>)
+declare fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128, <1 x fp128>)
 
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float, <3 x float>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v3f32(float, <3 x float>)
+declare fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128, <2 x fp128>)
+declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
 
 define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-LABEL: test_v1f16:
@@ -20,7 +20,7 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-NEXT:    fmul s0, s0, s1
 ; CHECK-NEXT:    fcvt h0, s0
 ; CHECK-NEXT:    ret
-  %b = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half 0.0, <1 x half> %a)
+  %b = call half @llvm.vector.reduce.fmul.f16.v1f16(half 0.0, <1 x half> %a)
   ret half %b
 }
 
@@ -31,7 +31,7 @@ define float @test_v1f32(<1 x float> %a) nounwind {
 ; CHECK-NEXT:    fmov s1, wzr
 ; CHECK-NEXT:    fmul s0, s1, v0.s[0]
 ; CHECK-NEXT:    ret
-  %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float 0.0, <1 x float> %a)
+  %b = call float @llvm.vector.reduce.fmul.f32.v1f32(float 0.0, <1 x float> %a)
   ret float %b
 }
 
@@ -41,7 +41,7 @@ define double @test_v1f64(<1 x double> %a) nounwind {
 ; CHECK-NEXT:    fmov d1, xzr
 ; CHECK-NEXT:    fmul d0, d0, d1
 ; CHECK-NEXT:    ret
-  %b = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double 0.0, <1 x double> %a)
+  %b = call double @llvm.vector.reduce.fmul.f64.v1f64(double 0.0, <1 x double> %a)
   ret double %b
 }
 
@@ -54,7 +54,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 ; CHECK-NEXT:    bl __multf3
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
   ret fp128 %b
 }
 
@@ -66,7 +66,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-NEXT:    fmul s1, s1, v0.s[1]
 ; CHECK-NEXT:    fmul s0, s1, v0.s[2]
 ; CHECK-NEXT:    ret
-  %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float 0.0, <3 x float> %a)
+  %b = call float @llvm.vector.reduce.fmul.f32.v3f32(float 0.0, <3 x float> %a)
   ret float %b
 }
 
@@ -84,7 +84,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #32 // =32
 ; CHECK-NEXT:    ret
-  %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
   ret fp128 %b
 }
 
@@ -109,6 +109,6 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:    fmul s0, s0, v3.s[2]
 ; CHECK-NEXT:    fmul s0, s0, v3.s[3]
 ; CHECK-NEXT:    ret
-  %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 0.0, <16 x float> %a)
+  %b = call float @llvm.vector.reduce.fmul.f32.v16f32(float 0.0, <16 x float> %a)
   ret float %b
 }

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
index cdb557d1a394..5675fd7e59c9 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
@@ -24,8 +24,8 @@ entry:
  %1 = insertelement <4 x double> %0, double 1.0, i32 1
  %2 = insertelement <4 x double> %1, double 1.0, i32 2
  %3 = insertelement <4 x double> %2, double 1.0, i32 3
- %4 = call nnan reassoc double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %3)
+ %4 = call nnan reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> %3)
  ret double %4
 }
 
-declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index 7c9415211681..811745a1ee41 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -1,29 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare i1 @llvm.experimental.vector.reduce.umax.v1i1(<1 x i1> %a)
-declare i8 @llvm.experimental.vector.reduce.umax.v1i8(<1 x i8> %a)
-declare i16 @llvm.experimental.vector.reduce.umax.v1i16(<1 x i16> %a)
-declare i24 @llvm.experimental.vector.reduce.umax.v1i24(<1 x i24> %a)
-declare i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a)
-declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> %a)
-declare i128 @llvm.experimental.vector.reduce.umax.v1i128(<1 x i128> %a)
-
-declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a)
-declare i8 @llvm.experimental.vector.reduce.umax.v3i8(<3 x i8> %a)
-declare i8 @llvm.experimental.vector.reduce.umax.v9i8(<9 x i8> %a)
-declare i32 @llvm.experimental.vector.reduce.umax.v3i32(<3 x i32> %a)
-declare i1 @llvm.experimental.vector.reduce.umax.v4i1(<4 x i1> %a)
-declare i24 @llvm.experimental.vector.reduce.umax.v4i24(<4 x i24> %a)
-declare i128 @llvm.experimental.vector.reduce.umax.v2i128(<2 x i128> %a)
-declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a)
+declare i1 @llvm.vector.reduce.umax.v1i1(<1 x i1> %a)
+declare i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %a)
+declare i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %a)
+declare i24 @llvm.vector.reduce.umax.v1i24(<1 x i24> %a)
+declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a)
+declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
+declare i128 @llvm.vector.reduce.umax.v1i128(<1 x i128> %a)
+
+declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
+declare i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> %a)
+declare i8 @llvm.vector.reduce.umax.v9i8(<9 x i8> %a)
+declare i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a)
+declare i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a)
+declare i24 @llvm.vector.reduce.umax.v4i24(<4 x i24> %a)
+declare i128 @llvm.vector.reduce.umax.v2i128(<2 x i128> %a)
+declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a)
 
 define i1 @test_v1i1(<1 x i1> %a) nounwind {
 ; CHECK-LABEL: test_v1i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w0, w0, #0x1
 ; CHECK-NEXT:    ret
-  %b = call i1 @llvm.experimental.vector.reduce.umax.v1i1(<1 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.umax.v1i1(<1 x i1> %a)
   ret i1 %b
 }
 
@@ -33,7 +33,7 @@ define i8 @test_v1i8(<1 x i8> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    umov w0, v0.b[0]
 ; CHECK-NEXT:    ret
-  %b = call i8 @llvm.experimental.vector.reduce.umax.v1i8(<1 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %a)
   ret i8 %b
 }
 
@@ -43,7 +43,7 @@ define i16 @test_v1i16(<1 x i16> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    umov w0, v0.h[0]
 ; CHECK-NEXT:    ret
-  %b = call i16 @llvm.experimental.vector.reduce.umax.v1i16(<1 x i16> %a)
+  %b = call i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %a)
   ret i16 %b
 }
 
@@ -51,7 +51,7 @@ define i24 @test_v1i24(<1 x i24> %a) nounwind {
 ; CHECK-LABEL: test_v1i24:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call i24 @llvm.experimental.vector.reduce.umax.v1i24(<1 x i24> %a)
+  %b = call i24 @llvm.vector.reduce.umax.v1i24(<1 x i24> %a)
   ret i24 %b
 }
 
@@ -61,7 +61,7 @@ define i32 @test_v1i32(<1 x i32> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a)
+  %b = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a)
   ret i32 %b
 }
 
@@ -71,7 +71,7 @@ define i64 @test_v1i64(<1 x i64> %a) nounwind {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-  %b = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> %a)
+  %b = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
   ret i64 %b
 }
 
@@ -79,7 +79,7 @@ define i128 @test_v1i128(<1 x i128> %a) nounwind {
 ; CHECK-LABEL: test_v1i128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %b = call i128 @llvm.experimental.vector.reduce.umax.v1i128(<1 x i128> %a)
+  %b = call i128 @llvm.vector.reduce.umax.v1i128(<1 x i128> %a)
   ret i128 %b
 }
 
@@ -92,7 +92,7 @@ define i64 @test_v2i64(<2 x i64> %a) nounwind {
 ; CHECK-NEXT:   cmp     x9, x8
 ; CHECK-NEXT:   csel    x0, x9, x8, hi
 ; CHECK-NEXT:   ret
-  %b = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a)
+  %b = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
   ret i64 %b
 }
 
@@ -107,7 +107,7 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind {
 ; CHECK-NEXT:    umaxv h0, v0.4h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i8 @llvm.experimental.vector.reduce.umax.v3i8(<3 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> %a)
   ret i8 %b
 }
 
@@ -124,7 +124,7 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
 ; CHECK-NEXT:    umaxv b0, v0.16b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i8 @llvm.experimental.vector.reduce.umax.v9i8(<9 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.umax.v9i8(<9 x i8> %a)
   ret i8 %b
 }
 
@@ -135,7 +135,7 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
 ; CHECK-NEXT:    umaxv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i32 @llvm.experimental.vector.reduce.umax.v3i32(<3 x i32> %a)
+  %b = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a)
   ret i32 %b
 }
 
@@ -148,7 +148,7 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
-  %b = call i1 @llvm.experimental.vector.reduce.umax.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -159,7 +159,7 @@ define i24 @test_v4i24(<4 x i24> %a) nounwind {
 ; CHECK-NEXT:    umaxv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i24 @llvm.experimental.vector.reduce.umax.v4i24(<4 x i24> %a)
+  %b = call i24 @llvm.vector.reduce.umax.v4i24(<4 x i24> %a)
   ret i24 %b
 }
 
@@ -173,7 +173,7 @@ define i128 @test_v2i128(<2 x i128> %a) nounwind {
 ; CHECK-NEXT:    csel x0, x8, x9, eq
 ; CHECK-NEXT:    csel x1, x1, x3, hi
 ; CHECK-NEXT:    ret
-  %b = call i128 @llvm.experimental.vector.reduce.umax.v2i128(<2 x i128> %a)
+  %b = call i128 @llvm.vector.reduce.umax.v2i128(<2 x i128> %a)
   ret i128 %b
 }
 
@@ -186,6 +186,6 @@ define i32 @test_v16i32(<16 x i32> %a) nounwind {
 ; CHECK-NEXT:    umaxv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-  %b = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a)
+  %b = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a)
   ret i32 %b
 }

diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
index aaa376a0ba6e..9e3d7e7775b4 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>)
+declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>)
 
 define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-LABEL: test_v4f16:
@@ -37,7 +37,7 @@ define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-NEXT:    bl __aeabi_f2h
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %a)
+  %b = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half 0.0, <4 x half> %a)
   ret half %b
 }
 
@@ -55,7 +55,7 @@ define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-NEXT:    bl __aeabi_fadd
 ; CHECK-NEXT:    pop {r4, r5, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a)
+  %b = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a)
   ret float %b
 }
 
@@ -67,7 +67,7 @@ define double @test_v2f64(<2 x double> %a) nounwind {
 ; CHECK-NEXT:    bl __aeabi_dadd
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double zeroinitializer, <2 x double> %a)
+  %b = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double zeroinitializer, <2 x double> %a)
   ret double %b
 }
 
@@ -90,6 +90,6 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
+  %b = call fast fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
   ret fp128 %b
 }

diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
index 39673bb2d786..83073b895b2c 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>)
+declare half @llvm.vector.reduce.fadd.f16.v1f16(half, <1 x half>)
+declare float @llvm.vector.reduce.fadd.f32.v1f32(float, <1 x float>)
+declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>)
+declare fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>)
 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v3f32(float, <3 x float>)
+declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>)
+declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
 
 define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-LABEL: test_v1f16:
@@ -28,7 +28,7 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI0_0:
 ; CHECK-NEXT:    .long 0x00000000 @ float 0
-  %b = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a)
+  %b = call half @llvm.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a)
   ret half %b
 }
 
@@ -44,7 +44,7 @@ define float @test_v1f32(<1 x float> %a) nounwind {
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI1_0:
 ; CHECK-NEXT:    .long 0x00000000 @ float 0
-  %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a)
+  %b = call float @llvm.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a)
   ret float %b
 }
 
@@ -56,7 +56,7 @@ define double @test_v1f64(<1 x double> %a) nounwind {
 ; CHECK-NEXT:    vadd.f64 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
-  %b = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a)
+  %b = call double @llvm.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a)
   ret double %b
 }
 
@@ -76,7 +76,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
   ret fp128 %b
 }
 
@@ -95,7 +95,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI4_0:
 ; CHECK-NEXT:    .long 0x00000000 @ float 0
-  %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a)
+  %b = call float @llvm.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a)
   ret float %b
 }
 
@@ -124,7 +124,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r4, r5, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
   ret fp128 %b
 }
 
@@ -162,6 +162,6 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI6_0:
 ; CHECK-NEXT:    .long 0x00000000 @ float 0
-  %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a)
+  %b = call float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a)
   ret float %b
 }

diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
index 586a02b92bf3..48968ee7ba77 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>)
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
-declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>)
-declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128>)
+declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128>)
 
 define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-LABEL: test_v4f16:
@@ -37,7 +37,7 @@ define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-NEXT:    bl __aeabi_f2h
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %a)
+  %b = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %b
 }
 
@@ -55,7 +55,7 @@ define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-NEXT:    bl fmaxf
 ; CHECK-NEXT:    pop {r4, r5, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a)
+  %b = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
   ret float %b
 }
 
@@ -67,7 +67,7 @@ define double @test_v2f64(<2 x double> %a) nounwind {
 ; CHECK-NEXT:    bl fmax
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a)
+  %b = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
   ret double %b
 }
 
@@ -90,6 +90,6 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a)
+  %b = call fast fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
   ret fp128 %b
 }

diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
index b64e4473981b..1252085ef948 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>)
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
-declare fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128>)
+declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128>)
 
 define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-LABEL: test_v4f16:
@@ -37,7 +37,7 @@ define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-NEXT:    bl __aeabi_f2h
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %a)
+  %b = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
   ret half %b
 }
 
@@ -55,7 +55,7 @@ define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-NEXT:    bl fminf
 ; CHECK-NEXT:    pop {r4, r5, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a)
+  %b = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
   ret float %b
 }
 
@@ -67,7 +67,7 @@ define double @test_v2f64(<2 x double> %a) nounwind {
 ; CHECK-NEXT:    bl fmin
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a)
+  %b = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
   ret double %b
 }
 
@@ -90,6 +90,6 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a)
+  %b = call fast fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a)
   ret fp128 %b
 }

diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll
index 62111e5f0f34..4217eab6ba70 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>)
+declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
+declare fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128, <2 x fp128>)
 
 define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-LABEL: test_v4f16:
@@ -37,7 +37,7 @@ define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-NEXT:    bl __aeabi_f2h
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half 1.0, <4 x half> %a)
+  %b = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %a)
   ret half %b
 }
 
@@ -55,7 +55,7 @@ define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-NEXT:    bl __aeabi_fmul
 ; CHECK-NEXT:    pop {r4, r5, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a)
+  %b = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
   ret float %b
 }
 
@@ -67,7 +67,7 @@ define double @test_v2f64(<2 x double> %a) nounwind {
 ; CHECK-NEXT:    bl __aeabi_dmul
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a)
+  %b = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a)
   ret double %b
 }
 
@@ -90,6 +90,6 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fast fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a)
+  %b = call fast fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a)
   ret fp128 %b
 }

diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
index 003b64be09a6..b3d23addad41 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
-declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half, <1 x half>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float, <1 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128, <1 x fp128>)
+declare half @llvm.vector.reduce.fmul.f16.v1f16(half, <1 x half>)
+declare float @llvm.vector.reduce.fmul.f32.v1f32(float, <1 x float>)
+declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>)
+declare fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128, <1 x fp128>)
 
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float, <3 x float>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v3f32(float, <3 x float>)
+declare fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128, <2 x fp128>)
+declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
 
 define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-LABEL: test_v1f16:
@@ -28,7 +28,7 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI0_0:
 ; CHECK-NEXT:    .long 0x00000000 @ float 0
-  %b = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half 0.0, <1 x half> %a)
+  %b = call half @llvm.vector.reduce.fmul.f16.v1f16(half 0.0, <1 x half> %a)
   ret half %b
 }
 
@@ -44,7 +44,7 @@ define float @test_v1f32(<1 x float> %a) nounwind {
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI1_0:
 ; CHECK-NEXT:    .long 0x00000000 @ float 0
-  %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float 0.0, <1 x float> %a)
+  %b = call float @llvm.vector.reduce.fmul.f32.v1f32(float 0.0, <1 x float> %a)
   ret float %b
 }
 
@@ -56,7 +56,7 @@ define double @test_v1f64(<1 x double> %a) nounwind {
 ; CHECK-NEXT:    vmul.f64 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
-  %b = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double 0.0, <1 x double> %a)
+  %b = call double @llvm.vector.reduce.fmul.f64.v1f64(double 0.0, <1 x double> %a)
   ret double %b
 }
 
@@ -76,7 +76,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
   ret fp128 %b
 }
 
@@ -95,7 +95,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI4_0:
 ; CHECK-NEXT:    .long 0x00000000 @ float 0
-  %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float 0.0, <3 x float> %a)
+  %b = call float @llvm.vector.reduce.fmul.f32.v3f32(float 0.0, <3 x float> %a)
   ret float %b
 }
 
@@ -124,7 +124,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r4, r5, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
-  %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
+  %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
   ret fp128 %b
 }
 
@@ -162,6 +162,6 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI6_0:
 ; CHECK-NEXT:    .long 0x00000000 @ float 0
-  %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 0.0, <16 x float> %a)
+  %b = call float @llvm.vector.reduce.fmul.f32.v16f32(float 0.0, <16 x float> %a)
   ret float %b
 }

diff --git a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
index e0e3149e3511..99aa181bd70f 100644
--- a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
+++ b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -1,24 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -expand-reductions -S | FileCheck %s
 ; Tests without a target which should expand all reductions
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
 
-declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
 
-declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
 
-declare i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8>)
+declare i8 @llvm.vector.reduce.and.i8.v3i8(<3 x i8>)
 
 define i64 @add_i64(<2 x i64> %vec) {
 ; CHECK-LABEL: @add_i64(
@@ -29,7 +29,7 @@ define i64 @add_i64(<2 x i64> %vec) {
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %vec)
+  %r = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %vec)
   ret i64 %r
 }
 
@@ -42,7 +42,7 @@ define i64 @mul_i64(<2 x i64> %vec) {
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %vec)
+  %r = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %vec)
   ret i64 %r
 }
 
@@ -55,7 +55,7 @@ define i64 @and_i64(<2 x i64> %vec) {
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %vec)
+  %r = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %vec)
   ret i64 %r
 }
 
@@ -68,7 +68,7 @@ define i64 @or_i64(<2 x i64> %vec) {
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %vec)
+  %r = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %vec)
   ret i64 %r
 }
 
@@ -81,7 +81,7 @@ define i64 @xor_i64(<2 x i64> %vec) {
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %vec)
+  %r = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %vec)
   ret i64 %r
 }
 
@@ -97,7 +97,7 @@ define float @fadd_f32(<4 x float> %vec) {
 ; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
-  %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec)
+  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %vec)
   ret float %r
 }
 
@@ -113,7 +113,7 @@ define float @fadd_f32_accum(float %accum, <4 x float> %vec) {
 ; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
-  %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec)
+  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
   ret float %r
 }
 
@@ -131,7 +131,7 @@ define float @fadd_f32_strict(<4 x float> %vec) {
 ; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
-  %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %vec)
+  %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
   ret float %r
 }
 
@@ -149,7 +149,7 @@ define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {
 ; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
-  %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec)
+  %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
   ret float %r
 }
 
@@ -165,7 +165,7 @@ define float @fmul_f32(<4 x float> %vec) {
 ; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
-  %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %vec)
+  %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %vec)
   ret float %r
 }
 
@@ -181,7 +181,7 @@ define float @fmul_f32_accum(float %accum, <4 x float> %vec) {
 ; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
-  %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec)
+  %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
   ret float %r
 }
 
@@ -199,7 +199,7 @@ define float @fmul_f32_strict(<4 x float> %vec) {
 ; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
-  %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %vec)
+  %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
   ret float %r
 }
 
@@ -217,7 +217,7 @@ define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {
 ; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
-  %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec)
+  %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
   ret float %r
 }
 
@@ -231,7 +231,7 @@ define i64 @smax_i64(<2 x i64> %vec) {
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %vec)
+  %r = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %vec)
   ret i64 %r
 }
 
@@ -245,7 +245,7 @@ define i64 @smin_i64(<2 x i64> %vec) {
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %vec)
+  %r = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %vec)
   ret i64 %r
 }
 
@@ -259,7 +259,7 @@ define i64 @umax_i64(<2 x i64> %vec) {
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %vec)
+  %r = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %vec)
   ret i64 %r
 }
 
@@ -273,7 +273,7 @@ define i64 @umin_i64(<2 x i64> %vec) {
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %vec)
+  %r = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %vec)
   ret i64 %r
 }
 
@@ -282,11 +282,11 @@ entry:
 define double @fmax_f64(<2 x double> %vec) {
 ; CHECK-LABEL: @fmax_f64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> [[VEC:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> [[VEC:%.*]])
 ; CHECK-NEXT:    ret double [[R]]
 ;
 entry:
-  %r = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %vec)
+  %r = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %vec)
   ret double %r
 }
 
@@ -295,11 +295,11 @@ entry:
 define double @fmin_f64(<2 x double> %vec) {
 ; CHECK-LABEL: @fmin_f64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> [[VEC:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> [[VEC:%.*]])
 ; CHECK-NEXT:    ret double [[R]]
 ;
 entry:
-  %r = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %vec)
+  %r = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %vec)
   ret double %r
 }
 
@@ -309,10 +309,10 @@ entry:
 define i8 @test_v3i8(<3 x i8> %a) nounwind {
 ; CHECK-LABEL: @test_v3i8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> [[A:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> [[A:%.*]])
 ; CHECK-NEXT:    ret i8 [[B]]
 ;
 entry:
-  %b = call i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8> %a)
+  %b = call i8 @llvm.vector.reduce.and.i8.v3i8(<3 x i8> %a)
   ret i8 %b
 }

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir
index 77e35ded50c1..5bc82d493abb 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir
@@ -44,7 +44,7 @@
     %add7 = add <4 x i32> %mul, %splat.output
     %max = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %add7, <4 x i32> %.splat.i42, i32 1, <4 x i1> %pred, <4 x i32> undef)
     %min = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %max, <4 x i32> %.splat.i, i32 1, <4 x i1> %pred, <4 x i32> undef)
-    %reduce = tail call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %min)
+    %reduce = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %min)
     store i32 %reduce, i32* %scevgep2
     %add.ptr = getelementptr inbounds i8, i8* %input_1_vect.addr.052, i32 4
     %add.ptr14 = getelementptr inbounds i8, i8* %input_2_vect.addr.051, i32 4
@@ -62,7 +62,7 @@
   declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
   declare i1 @llvm.test.set.loop.iterations.i32(i32) #4
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #4
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #5
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #5
 
 ...
 ---

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
index 522cce49f75a..29ecf00c556f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -85,7 +85,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
-  %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8)
+  %tmp9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp8)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -188,7 +188,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
-  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
+  %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -287,7 +287,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
-  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
+  %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -386,7 +386,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
-  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
+  %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -528,6 +528,6 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
 
 ; Function Attrs: nounwind readnone willreturn
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
index f27a98c3837f..ab3c866b015d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
@@ -56,7 +56,7 @@
     br i1 %tmp16, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %tmp17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp14)
+    %tmp17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp14)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -64,7 +64,7 @@
     ret i32 %res.0.lcssa
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
   declare void @llvm.set.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
index 5db355a86f23..b796712aa6ac 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
@@ -58,7 +58,7 @@
     br i1 %tmp16, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %tmp17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp14)
+    %tmp17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp14)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -66,7 +66,7 @@
     ret i32 %res.0.lcssa
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
   declare void @llvm.set.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
index ab2ffb50c6f8..2b354005535e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
@@ -68,7 +68,7 @@
     %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
     %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
     %tmp12 = mul nsw <4 x i32> %pass, %tmp10
-    %tmp13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp12)
+    %tmp13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp12)
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
     %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
     %tmp16 = icmp ne i32 %tmp15, 0
@@ -105,7 +105,7 @@
     %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
     %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
     %tmp12 = add nsw <4 x i32> %pass, %tmp10
-    %tmp13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp12)
+    %tmp13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp12)
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
     %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
     %tmp16 = icmp ne i32 %tmp15, 0
@@ -117,7 +117,7 @@
     ret i32 %res
   }
 
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
   declare void @llvm.set.loop.iterations.i32(i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
index e5131fd4e1b4..cdf53b8666ee 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir
@@ -40,7 +40,7 @@
     br i1 %15, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %16 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13)
+    %16 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -88,7 +88,7 @@
     br i1 %15, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %16 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13)
+    %16 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -98,7 +98,7 @@
 
   declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
   declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
-  declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
+  declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
   declare void @llvm.set.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
index 886fbe7f0b17..f9d1abbee9e5 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir
@@ -91,7 +91,7 @@
     %22 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %10)
     %23 = bitcast i16* %lsr.iv7 to i1*
     %24 = select <4 x i1> %22, <4 x i32> %.lcssa, <4 x i32> %vec.phi.lcssa
-    %25 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %24)
+    %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
     %sunkaddr = mul i32 %i.064.us, 4
     %26 = bitcast i32* %e to i8*
     %sunkaddr17 = getelementptr inbounds i8, i8* %26, i32 %sunkaddr
@@ -141,7 +141,7 @@
   }
   declare dso_local arm_aapcs_vfpcc signext i16 @crc16(...) local_unnamed_addr #0
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
   declare void @llvm.set.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index 2b90065ea0e8..5c3af352782b 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -69,7 +69,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
-  %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
+  %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -145,7 +145,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
-  %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
+  %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -221,7 +221,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
-  %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
+  %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -297,7 +297,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
-  %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
+  %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -371,7 +371,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %6 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> %vec.phi
-  %7 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %6)
+  %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -1273,6 +1273,6 @@ declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
index e5bcf2e6077f..c797e0401d4f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -51,7 +51,7 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
 ; CHECK-NEXT:    br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
 ; CHECK-NEXT:    store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4
 ; CHECK-NEXT:    [[INC10_US]] = add nuw i32 [[I_025_US]], 1
 ; CHECK-NEXT:    [[EXITCOND27:%.*]] = icmp eq i32 [[INC10_US]], [[N]]
@@ -112,7 +112,7 @@ vector.body:                                      ; preds = %vector.body, %for.c
 
 middle.block:                                     ; preds = %vector.body
   %tmp17 = select <4 x i1> %tmp7, <4 x i32> %tmp14, <4 x i32> %vec.phi
-  %tmp18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp17)
+  %tmp18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp17)
   store i32 %tmp18, i32* %arrayidx8.us, align 4
   %inc10.us = add nuw i32 %i.025.us, 1
   %exitcond27 = icmp eq i32 %inc10.us, %N
@@ -170,7 +170,7 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
+; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
 ; CHECK-NEXT:    store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4
 ; CHECK-NEXT:    [[INC9_US]] = add nuw i32 [[I_024_US]], 1
 ; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i32 [[INC9_US]], [[N]]
@@ -229,7 +229,7 @@ vector.body:                                      ; preds = %vector.body, %for.c
 
 middle.block:                                     ; preds = %vector.body
   %tmp15 = select <4 x i1> %tmp7, <4 x i32> %tmp12, <4 x i32> %vec.phi
-  %tmp16 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp15)
+  %tmp16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp15)
   store i32 %tmp16, i32* %arrayidx7.us, align 4
   %inc9.us = add nuw i32 %i.024.us, 1
   %exitcond26 = icmp eq i32 %inc9.us, %N
@@ -247,7 +247,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i
 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #0
 
 ; Function Attrs: nounwind readnone willreturn
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #1
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1
 
 ; Function Attrs: noduplicate nounwind
 declare void @llvm.set.loop.iterations.i32(i32) #2

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
index 9eb95d7e8072..4a5f48331090 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir
@@ -40,7 +40,7 @@
     br i1 %tmp15, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %tmp16 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp13)
+    %tmp16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp13)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -48,7 +48,7 @@
     ret i32 %res.0.lcssa
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
   declare void @llvm.set.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
index 65f9cc3176b1..c27a6c32f5b3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir
@@ -44,7 +44,7 @@
     %.lcssa = phi <16 x i8> [ %13, %vector.body ]
     %16 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %7)
     %17 = select <16 x i1> %16, <16 x i8> %.lcssa, <16 x i8> %vec.phi.lcssa
-    %18 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %17)
+    %18 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %17)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -53,7 +53,7 @@
   }
 
   declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1
-  declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) #2
+  declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #2
   declare void @llvm.set.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir
index 966bdc913c46..3a098f272cc0 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir
@@ -36,7 +36,7 @@
     br i1 %cmp, label %for.body, label %middle.block
 
   middle.block:                                     ; preds = %for.body
-    %reduce = tail call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %acc.next)
+    %reduce = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %acc.next)
     ret i16 %reduce
 
   for.cond.cleanup:                                 ; preds = %entry
@@ -47,7 +47,7 @@
   declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) #2
   declare i1 @llvm.test.set.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
-  declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) #4
+  declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #4
   declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
 
 ...

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
index f013cb2f8615..c3655baeb8b1 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir
@@ -41,7 +41,7 @@
     br i1 %16, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14)
+    %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -88,7 +88,7 @@
     br i1 %16, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14)
+    %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -135,7 +135,7 @@
     br i1 %16, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14)
+    %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -182,7 +182,7 @@
     br i1 %16, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14)
+    %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -228,7 +228,7 @@
     br i1 %14, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12)
+    %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %12)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -274,7 +274,7 @@
     br i1 %14, label %vector.body, label %middle.block
 
   middle.block:                                     ; preds = %vector.body
-    %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12)
+    %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %12)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -285,7 +285,7 @@
   declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>)
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
   declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
   declare void @llvm.set.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index a0cdb822b370..f9116634a4d5 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -45,7 +45,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %i4 = add <16 x i8> %wide.masked.load, %wide.masked.load16
   %i5 = select <16 x i1> %active.lane.mask, <16 x i8> %i4, <16 x i8> %vec.phi
-  %i6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i5)
+  %i6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i5)
   %index.next = add i32 %index, 16
   %i7 = icmp eq i32 %index.next, %n.vec
   br i1 %i7, label %middle.block, label %vector.body
@@ -123,7 +123,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
-  %i10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i9)
+  %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -193,7 +193,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi
-  %i8 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i7)
+  %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -265,7 +265,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
-  %i10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i9)
+  %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -335,7 +335,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi
-  %i8 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i7)
+  %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -407,7 +407,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
-  %i10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i9)
+  %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -504,7 +504,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %i9 = select <4 x i1> %active.lane.mask, <4 x i32> %i7, <4 x i32> %vec.phi
-  %i10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %i9)
+  %i10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i9)
   br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph47
 
 vector.ph47:                                      ; preds = %middle.block
@@ -534,7 +534,7 @@ vector.body46:                                    ; preds = %vector.body46, %vec
 
 middle.block44:                                   ; preds = %vector.body46
   %i21 = select <4 x i1> %active.lane.mask61, <4 x i32> %i19, <4 x i32> %vec.phi60
-  %i22 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %i21)
+  %i22 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i21)
   br label %for.cond.cleanup7
 
 for.cond.cleanup7:                                ; preds = %middle.block44, %middle.block, %entry
@@ -620,9 +620,9 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %i11 = select <8 x i1> %active.lane.mask, <8 x i16> %i8, <8 x i16> %vec.phi
-  %i12 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i11)
+  %i12 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i11)
   %i13 = select <8 x i1> %active.lane.mask, <8 x i16> %i9, <8 x i16> %vec.phi.1
-  %i14 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i13)
+  %i14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i13)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -747,7 +747,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %10 = select <4 x i1> %active.lane.mask, <4 x i32> %8, <4 x i32> %vec.phi
-  %11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %10)
+  %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10)
   br label %for.end
 
 for.end:                                          ; preds = %middle.block, %lor.end
@@ -758,10 +758,10 @@ for.end:                                          ; preds = %middle.block, %lor.
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
 declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
index 17acf67a112b..497f0412589b 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir
@@ -46,7 +46,7 @@
     %.lcssa = phi <4 x i32> [ %15, %vector.body ], !dbg !38
     %18 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %9), !dbg !34
     %19 = select <4 x i1> %18, <4 x i32> %.lcssa, <4 x i32> %vec.phi.lcssa, !dbg !38
-    %20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %19), !dbg !32
+    %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19), !dbg !32
     br label %for.cond.cleanup, !dbg !42
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -58,7 +58,7 @@
 
   declare void @llvm.dbg.value(metadata, metadata, metadata)
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
   declare void @llvm.set.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
index 338c980eeb9b..d786209ad3fb 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
@@ -258,7 +258,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %19 = select <4 x i1> %active.lane.mask, <4 x i32> %16, <4 x i32> %vec.phi
-  %20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %19)
+  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
   br label %for.end
 
 for.end:                                          ; preds = %middle.block, %for.body
@@ -282,6 +282,6 @@ declare void @llvm.set.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
index 26be5328027a..6d140589287d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
@@ -74,14 +74,14 @@ vector.body:                                      ; preds = %vector.body, %entry
   br i1 %8, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %9 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %7)
-  %10 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %5)
+  %9 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %7)
+  %10 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %5)
   store i32 %10, i32* %minp, align 4
   ret i32 %9
 }
 
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) #3
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) #3
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) #3
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) #3
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
index 1f212c9e3aa3..dec54006c7d9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir
@@ -26,7 +26,7 @@
     %tmp8 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %tmp7)
     %tmp9 = sub i32 %tmp7, 8
     %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef)
-    %min = tail call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %wide.masked.load)
+    %min = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %wide.masked.load)
     store i16 %min, i16* %lsr.iv.2
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 8
     %scevgep.2 = getelementptr i16, i16* %lsr.iv.2, i32 1
@@ -43,7 +43,7 @@
   declare void @llvm.set.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <8 x i1> @llvm.arm.mve.vctp16(i32)
-  declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
+  declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
 
 ...
 ---

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
index cd8310c1004b..1d9f7d72877e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir
@@ -26,7 +26,7 @@
     %tmp9 = sub i32 %tmp7, 4
     %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
     %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
-    %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp10)
+    %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp10)
     store i32 %tmp11, i32* %store.addr
     %store.next = getelementptr i32, i32* %store.addr, i32 1
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
@@ -64,7 +64,7 @@
     %tmp9 = sub i32 %tmp7, 8
     %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef)
     %sext = sext <8 x i16> %wide.masked.load to <8 x i32>
-    %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %sext)
+    %tmp11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %sext)
     store i32 %tmp11, i32* %store.addr
     %store.next = getelementptr i32, i32* %store.addr, i32 1
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 8
@@ -102,7 +102,7 @@
     %tmp9 = sub i32 %tmp7, 16
     %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef)
     %sext = sext <16 x i8> %wide.masked.load to <16 x i32>
-    %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %sext)
+    %tmp11 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %sext)
     store i32 %tmp11, i32* %store.addr
     %store.next = getelementptr i32, i32* %store.addr, i32 1
     %scevgep = getelementptr i8, i8* %lsr.iv, i32 16
@@ -140,7 +140,7 @@
     %tmp9 = sub i32 %tmp7, 4
     %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
     %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
-    %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp10)
+    %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp10)
     %acc.next = add i32 %tmp11, %acc
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
     %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
@@ -179,7 +179,7 @@
     %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
     %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
     %not = xor <4 x i32> %tmp10, <i32 -1, i32 -1, i32 -1, i32 -1>
-    %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not)
+    %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not)
     store i32 %tmp11, i32* %store.addr
     %store.next = getelementptr i32, i32* %store.addr, i32 1
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
@@ -218,7 +218,7 @@
     %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
     %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
     %not = xor <4 x i32> %tmp10, <i32 -1, i32 -1, i32 -1, i32 -1>
-    %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not)
+    %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not)
     %acc.next = add i32 %tmp11, %acc
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
     %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
@@ -257,7 +257,7 @@
     %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
     %tmp10 = zext <4 x i16> %wide.masked.load to <4 x i32>
     %not = xor <4 x i32> %tmp10, <i32 -1, i32 -1, i32 -1, i32 -1>
-    %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not)
+    %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not)
     store i32 %tmp11, i32* %store.addr
     %store.next = getelementptr i32, i32* %store.addr, i32 1
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
@@ -296,7 +296,7 @@
     %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
     %tmp10 = zext <4 x i16> %wide.masked.load to <4 x i32>
     %not = xor <4 x i32> %tmp10, <i32 -1, i32 -1, i32 -1, i32 -1>
-    %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not)
+    %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not)
     %acc.next = add i32 %tmp11, %acc
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
     %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
@@ -335,7 +335,7 @@
     %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %lsr.iv17, i32 1, <8 x i1> %tmp8, <8 x i8> undef)
     %sext.wide = sext <8 x i8> %wide.masked.load to <8 x i16>
     %sub = sub <8 x i16> %sext.wide, %pass
-    %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub)
+    %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub)
     %sext.reduce = sext i16 %reduce to i32
     store i32 %sext.reduce, i32* %store.addr
     %store.next = getelementptr i32, i32* %store.addr, i32 1
@@ -375,7 +375,7 @@
     %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %lsr.iv17, i32 1, <8 x i1> %tmp8, <8 x i8> undef)
     %sext.wide = sext <8 x i8> %wide.masked.load to <8 x i16>
     %sub = sub <8 x i16> %sext.wide, %pass
-    %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub)
+    %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub)
     %sext.reduce = sext i16 %reduce to i32
     %acc.next = add i32 %sext.reduce, %acc
     %scevgep = getelementptr i8, i8* %lsr.iv, i32 8
@@ -414,7 +414,7 @@
     %tmp9 = sub i32 %tmp7, 8
     %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef)
     %sub = sub <8 x i16> %wide.masked.load, %pass
-    %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub)
+    %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub)
     %zext.reduce = zext i16 %reduce to i32
     store i32 %zext.reduce, i32* %store.addr
     %store.next = getelementptr i32, i32* %store.addr, i32 1
@@ -453,7 +453,7 @@
     %tmp9 = sub i32 %tmp7, 8
     %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef)
     %sub = sub <8 x i16> %wide.masked.load, %pass
-    %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub)
+    %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub)
     %zext.reduce = zext i16 %reduce to i32
     %acc.next = add i32 %zext.reduce, %acc
     %scevgep = getelementptr i16, i16* %lsr.iv, i32 8
@@ -492,7 +492,7 @@
     %tmp9 = sub i32 %tmp7, 16
     %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef)
     %xor = xor <16 x i8> %wide.masked.load, %pass
-    %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor)
+    %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor)
     %sext.reduce = sext i8 %reduce to i32
     store i32 %sext.reduce, i32* %store.addr
     %store.next = getelementptr i32, i32* %store.addr, i32 1
@@ -531,7 +531,7 @@
     %tmp9 = sub i32 %tmp7, 16
     %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef)
     %xor = xor <16 x i8> %wide.masked.load, %pass
-    %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor)
+    %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor)
     %sext.reduce = sext i8 %reduce to i32
     %acc.next = add i32 %sext.reduce, %acc
     %scevgep = getelementptr i8, i8* %lsr.iv, i32 16
@@ -570,7 +570,7 @@
     %tmp9 = sub i32 %tmp7, 16
     %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef)
     %xor = xor <16 x i8> %wide.masked.load, %pass
-    %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor)
+    %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor)
     %zext.reduce = zext i8 %reduce to i32
     store i32 %zext.reduce, i32* %store.addr
     %store.next = getelementptr i32, i32* %store.addr, i32 1
@@ -609,7 +609,7 @@
     %tmp9 = sub i32 %tmp7, 16
     %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef)
     %xor = xor <16 x i8> %wide.masked.load, %pass
-    %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor)
+    %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor)
     %zext.reduce = zext i8 %reduce to i32
     %acc.next = add i32 %zext.reduce, %acc
     %scevgep = getelementptr i8, i8* %lsr.iv, i32 16
@@ -652,7 +652,7 @@
     %tmp4 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp3, i32 2, <4 x i1> %tmp, <4 x i16> zeroinitializer)
     %zext.wide.2 = zext <4 x i16> %tmp4 to <4 x i32>
     %or = or <4 x i32> %zext.wide.1, %zext.wide.2
-    %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %or)
+    %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %or)
     %acc.next = add i32 %reduce, %acc
     %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 4
     %add.ptr4 = getelementptr inbounds i16, i16* %y.addr.025, i32 4
@@ -693,7 +693,7 @@
     %tmp2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp1, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer)
     %tmp4 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp3, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer)
     %or = or <8 x i16> %tmp2, %tmp4
-    %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %or)
+    %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %or)
     %zext.reduce = zext i16 %reduce to i32
     %acc.next = add i32 %zext.reduce, %acc
     %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 8
@@ -737,7 +737,7 @@
     %tmp5 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp4, i32 0, i32 1)
     %tmp6 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp4, i32 0, i32 0)
     %mul = add <4 x i32> %tmp5, %tmp6
-    %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %mul)
+    %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
     %acc.next = add i32 %reduce, %acc
     %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 8
     %add.ptr4 = getelementptr inbounds i16, i16* %y.addr.025, i32 8
@@ -778,7 +778,7 @@
     %tmp2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp1, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer)
     %tmp4 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp3, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer)
     %mul = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp4, i32 0, i32 1)
-    %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %mul)
+    %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
     %acc.next = add i32 %reduce, %acc
     %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 8
     %add.ptr4 = getelementptr inbounds i16, i16* %y.addr.025, i32 8
@@ -798,11 +798,11 @@
   declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
   declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-  declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-  declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-  declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-  declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+  declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+  declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+  declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+  declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
   declare void @llvm.set.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 15aed3bd4e17..7aa772c6394b 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -214,7 +214,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %i19 = select <4 x i1> %active.lane.mask, <4 x i32> %i16, <4 x i32> %vec.phi
-  %i20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %i19)
+  %i20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i19)
   br label %for.end
 
 for.end:                                          ; preds = %middle.block, %for.body
@@ -235,6 +235,6 @@ for.end17:                                        ; preds = %for.end, %entry
 
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
 declare void @llvm.set.loop.iterations.i32(i32)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir
index 4f80869de3cc..4308c7e50eda 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir
@@ -47,7 +47,7 @@
     %15 = add i32 %8, 4
     %16 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %15)
     %17 = select <4 x i1> %16, <4 x i32> %12, <4 x i32> %vec.phi
-    %18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %17)
+    %18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %17)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -55,7 +55,7 @@
     ret i32 %res.0.lcssa
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
   declare void @llvm.set.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <4 x i1> @llvm.arm.mve.vctp32(i32)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir
index a42c33e24f26..9799ceb98c67 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir
@@ -46,7 +46,7 @@
     %.lcssa = phi <8 x i16> [ %15, %vector.body ]
     %18 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7)
     %19 = select <8 x i1> %18, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa
-    %20 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %19)
+    %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -54,7 +54,7 @@
     ret i16 %a.0.lcssa
   }
   declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
-  declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
+  declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
   declare void @llvm.set.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <8 x i1> @llvm.arm.mve.vctp16(i32)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
index 6628df20f202..422fc3c874da 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -70,7 +70,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
-  %9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8)
+  %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -141,7 +141,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
-  %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
+  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -212,7 +212,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
-  %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
+  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -459,7 +459,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i
 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
index 64e7552b92b3..c05ed7dcfcfb 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -16,7 +16,7 @@
 
 ; CHECK: middle.block:
 ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]],
-; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
+; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
 
 define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
 entry:
@@ -64,7 +64,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
 
 middle.block:                                     ; preds = %vector.body
   %12 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %vec.phi
-  %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12)
+  %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %12)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -73,7 +73,7 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
 }
 
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare void @llvm.set.loop.iterations.i32(i32)
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir
index 23cdf73263b0..07c136ec7c37 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir
@@ -118,7 +118,7 @@
   middle.block:                                     ; preds = %vector.body
     %8 = call <4 x i1> @llvm.arm.vctp32(i32 %5)
     %tmp8 = select <4 x i1> %8, <4 x i32> %tmp6, <4 x i32> %vec.phi
-    %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8)
+    %tmp9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp8)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -134,7 +134,7 @@
   declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
   declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
   declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
   declare <16 x i1> @llvm.arm.vctp8(i32)
   declare void @llvm.stackprotector(i8*, i8**)
   declare <8 x i1> @llvm.arm.vctp16(i32)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir
index fc0aa2020df9..fa7304ebe6ba 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir
@@ -46,7 +46,7 @@
     %.lcssa = phi <8 x i16> [ %15, %vector.body ]
     %18 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7)
     %19 = select <8 x i1> %18, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa
-    %20 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %19)
+    %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -54,7 +54,7 @@
     ret i16 %a.0.lcssa
   }
   declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
-  declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
+  declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
   declare void @llvm.set.loop.iterations.i32(i32)
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
   declare <8 x i1> @llvm.arm.mve.vctp16(i32)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir
index d91556e3e70b..7ef303a1a949 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir
@@ -52,7 +52,7 @@
     %n.splat = shufflevector <4 x i32> %insert.n, <4 x i32> undef, <4 x i32> zeroinitializer
     %tmp16 = icmp ult <4 x i32> %idx.splat, %n.splat
     %tmp17 = select <4 x i1> %tmp16, <4 x i32> %tmp13, <4 x i32> %vec.phi
-    %tmp18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp17)
+    %tmp18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp17)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -60,7 +60,7 @@
     ret i32 %res.0.lcssa
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
   declare void @llvm.set.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir
index 337816146e5f..00abf1603fb6 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir
@@ -45,7 +45,7 @@
   middle.block:                                     ; preds = %vector.body
     %15 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %8)
     %16 = select <4 x i1> %15, <4 x i32> %12, <4 x i32> %vec.phi
-    %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %16)
+    %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %16)
     br label %for.cond.cleanup
 
   for.cond.cleanup:                                 ; preds = %middle.block, %entry
@@ -53,7 +53,7 @@
     ret i32 %res.0.lcssa
   }
   declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
-  declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2
+  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
   declare void @llvm.set.loop.iterations.i32(i32) #3
   declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
   declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index d158c85e401b..e06ec427599d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -572,7 +572,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %10, label %middle.block, label %vector.body, !llvm.loop !7
 
 middle.block:                                     ; preds = %vector.body
-  %11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9)
+  %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
 ;for.cond8.for.cond.cleanup10_crit_edge.us.us:     ; preds = %for.body11.us.us, %middle.block
   %add19.us.us = add i32 %j.051.us.us, %mul18.us
   %arrayidx20.us.us = getelementptr inbounds i32, i32* %C, i32 %add19.us.us
@@ -803,7 +803,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %12, label %middle.block, label %vector.body, !llvm.loop !7
 
 middle.block:                                     ; preds = %vector.body
-  %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %11)
+  %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %11)
   br i1 %cmp.n, label %for.cond5.for.cond.cleanup7_crit_edge.us.us, label %for.body8.us.us.preheader
 
 for.cond5.for.cond.cleanup7_crit_edge.us.us:      ; preds = %for.body8.us.us, %middle.block
@@ -1065,7 +1065,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %wide.masked.gather75 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %tmp85, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
   %tmp86 = sext <4 x i8> %wide.masked.gather75 to <4 x i32>
   %tmp87 = mul nsw <4 x i32> %tmp84, %tmp86
-  %tmp88 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp87)
+  %tmp88 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp87)
   %tmp89 = add i32 %tmp88, %vec.phi
   %index.next = add i32 %index, 4
   %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
@@ -1091,7 +1091,7 @@ declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <
 declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
 declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32 immarg, <4 x i1>, <4 x i8>) #3
 
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare void @llvm.memset.p0i8.i32(i8* align 2, i8, i32, i1)
 
 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
index a4a67512b719..5c32f37455ec 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
@@ -62,7 +62,7 @@ vector.body:                                      ; preds = %vector.body, %entry
   br i1 %8, label %middle.block, label %vector.body
 middle.block:                                     ; preds = %vector.body
   %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi
-  %10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9)
+  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
   store i32 %10, i32* %arrayidx.us.us, align 4
   %inc21.us.us = add nuw i32 4, 1
   %exitcond81.not = icmp eq i32 %inc21.us.us, %n
@@ -139,7 +139,7 @@ vector.body:                                      ; preds = %vector.body, %entry
   br i1 %8, label %middle.block, label %vector.body
 middle.block:                                     ; preds = %vector.body
   %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi
-  %10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9)
+  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
   store i32 %10, i32* %arrayidx.us.us, align 4
   %inc21.us.us = add nuw i32 4, 1
   %exitcond81.not = icmp eq i32 %inc21.us.us, %n
@@ -210,7 +210,7 @@ vector.body:                                      ; preds = %vector.body, %entry
   br i1 %8, label %middle.block, label %vector.body
 middle.block:                                     ; preds = %vector.body
   %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi
-  %10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9)
+  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
   store i32 %10, i32* %arrayidx.us.us, align 4
   %inc21.us.us = add nuw i32 4, 1
   %exitcond81.not = icmp eq i32 %inc21.us.us, %n
@@ -440,7 +440,7 @@ for.cond.cleanup:                                 ; preds = %vector.body, %for.b
   ret void
 }
 
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
 declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)

diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index d67ccd9393cc..b710912e808d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -1390,7 +1390,7 @@ declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4
 declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i8(<16 x i32> %ext4)
+declare i32 @llvm.vector.reduce.add.v16i8(<16 x i32> %ext4)
 declare i32 @llvm.arm.mve.vmldava.v8i16(i32, i32, i32, i32, <8 x i16>, <8 x i16>)
 declare i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32, i32, i32, i32, <16 x i8>, <16 x i8>, <16 x i1>)
 declare i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32, i32, i32, i32, <8 x i16>, <8 x i16>, <8 x i1>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
index e3f236bedc04..d4a04567f099 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
 
-declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
-declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
-declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
-declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
+declare i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>)
+declare i16 @llvm.vector.reduce.add.i16.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
+declare i8 @llvm.vector.reduce.add.i8.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8>)
 
 define arm_aapcs_vfpcc i64 @vaddv_v2i64_i64(<2 x i64> %s1) {
 ; CHECK-LABEL: vaddv_v2i64_i64:
@@ -20,7 +20,7 @@ define arm_aapcs_vfpcc i64 @vaddv_v2i64_i64(<2 x i64> %s1) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %s1)
+  %r = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1)
   ret i64 %r
 }
 
@@ -30,7 +30,7 @@ define arm_aapcs_vfpcc i32 @vaddv_v4i32_i32(<4 x i32> %s1) {
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %s1)
   ret i32 %r
 }
 
@@ -41,7 +41,7 @@ define arm_aapcs_vfpcc i32 @vaddv_v8i32_i32(<8 x i32> %s1) {
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %r = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
   ret i32 %r
 }
 
@@ -51,7 +51,7 @@ define arm_aapcs_vfpcc i16 @vaddv_v8i16_i16(<8 x i16> %s1) {
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.add.i16.v8i16(<8 x i16> %s1)
   ret i16 %r
 }
 
@@ -62,7 +62,7 @@ define arm_aapcs_vfpcc i16 @vaddv_v16i16_i16(<16 x i16> %s1) {
 ; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %r = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
   ret i16 %r
 }
 
@@ -72,7 +72,7 @@ define arm_aapcs_vfpcc i8 @vaddv_v16i8_i8(<16 x i8> %s1) {
 ; CHECK-NEXT:    vaddv.u8 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.add.i8.v16i8(<16 x i8> %s1)
   ret i8 %r
 }
 
@@ -83,7 +83,7 @@ define arm_aapcs_vfpcc i8 @vaddv_v32i8_i8(<32 x i8> %s1) {
 ; CHECK-NEXT:    vaddv.u8 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %r = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1)
   ret i8 %r
 }
 
@@ -102,7 +102,7 @@ define arm_aapcs_vfpcc i64 @vaddva_v2i64_i64(<2 x i64> %s1, i64 %x) {
 ; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
-  %t = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %s1)
+  %t = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1)
   %r = add i64 %t, %x
   ret i64 %r
 }
@@ -113,7 +113,7 @@ define arm_aapcs_vfpcc i32 @vaddva_v4i32_i32(<4 x i32> %s1, i32 %x) {
 ; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %t = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %s1)
+  %t = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %s1)
   %r = add i32 %t, %x
   ret i32 %r
 }
@@ -125,7 +125,7 @@ define arm_aapcs_vfpcc i32 @vaddva_v8i32_i32(<8 x i32> %s1, i32 %x) {
 ; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %t = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
+  %t = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
   %r = add i32 %t, %x
   ret i32 %r
 }
@@ -136,7 +136,7 @@ define arm_aapcs_vfpcc i16 @vaddva_v8i16_i16(<8 x i16> %s1, i16 %x) {
 ; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %t = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %s1)
+  %t = call i16 @llvm.vector.reduce.add.i16.v8i16(<8 x i16> %s1)
   %r = add i16 %t, %x
   ret i16 %r
 }
@@ -148,7 +148,7 @@ define arm_aapcs_vfpcc i16 @vaddva_v16i16_i16(<16 x i16> %s1, i16 %x) {
 ; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %t = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
+  %t = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
   %r = add i16 %t, %x
   ret i16 %r
 }
@@ -159,7 +159,7 @@ define arm_aapcs_vfpcc i8 @vaddva_v16i8_i8(<16 x i8> %s1, i8 %x) {
 ; CHECK-NEXT:    vaddva.u8 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %t = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %s1)
+  %t = call i8 @llvm.vector.reduce.add.i8.v16i8(<16 x i8> %s1)
   %r = add i8 %t, %x
   ret i8 %r
 }
@@ -171,7 +171,7 @@ define arm_aapcs_vfpcc i8 @vaddva_v32i8_i8(<32 x i8> %s1, i8 %x) {
 ; CHECK-NEXT:    vaddva.u8 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %t = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %s1)
+  %t = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1)
   %r = add i8 %t, %x
   ret i8 %r
 }

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
index 35eecabb75ff..df2cb4361f2f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
@@ -7,7 +7,7 @@ define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x) {
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
   ret i32 %z
 }
 
@@ -18,7 +18,7 @@ define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
   ret i64 %z
 }
 
@@ -29,7 +29,7 @@ define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
   ret i64 %z
 }
 
@@ -47,7 +47,7 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   ret i64 %z
 }
 
@@ -65,7 +65,7 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   ret i64 %z
 }
 
@@ -76,7 +76,7 @@ define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
   ret i32 %z
 }
 
@@ -87,7 +87,7 @@ define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
   ret i32 %z
 }
 
@@ -99,7 +99,7 @@ define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
   ret i32 %z
 }
 
@@ -111,7 +111,7 @@ define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
   ret i32 %z
 }
 
@@ -122,7 +122,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
 ; CHECK-NEXT:    uxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   ret i16 %z
 }
 
@@ -175,7 +175,7 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
   ret i64 %z
 }
 
@@ -242,7 +242,7 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
   ret i64 %z
 }
 
@@ -258,7 +258,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   ret i64 %z
 }
 
@@ -278,7 +278,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   ret i64 %z
 }
 
@@ -289,7 +289,7 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
   ret i32 %z
 }
 
@@ -300,7 +300,7 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
   ret i32 %z
 }
 
@@ -313,7 +313,7 @@ define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
   ret i32 %z
 }
 
@@ -326,7 +326,7 @@ define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
   ret i32 %z
 }
 
@@ -338,7 +338,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
   ret i16 %z
 }
 
@@ -350,7 +350,7 @@ define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
   ret i16 %z
 }
 
@@ -363,7 +363,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
   ret i16 %z
 }
 
@@ -376,7 +376,7 @@ define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
   ret i16 %z
 }
 
@@ -387,7 +387,7 @@ define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
 ; CHECK-NEXT:    uxtb r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   ret i8 %z
 }
 
@@ -492,7 +492,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
   ret i64 %z
 }
 
@@ -627,7 +627,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
   ret i64 %z
 }
 
@@ -643,7 +643,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   ret i64 %z
 }
 
@@ -663,7 +663,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   ret i64 %z
 }
 
@@ -678,7 +678,7 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x) {
 ; CHECK-NEXT:    adcs r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
   ret i64 %z
 }
 
@@ -688,7 +688,7 @@ define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
 ; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -700,7 +700,7 @@ define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -712,7 +712,7 @@ define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -735,7 +735,7 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -756,7 +756,7 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -768,7 +768,7 @@ define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -780,7 +780,7 @@ define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -793,7 +793,7 @@ define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -806,7 +806,7 @@ define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -818,7 +818,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
 ; CHECK-NEXT:    uxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -876,7 +876,7 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -948,7 +948,7 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -967,7 +967,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -990,7 +990,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1002,7 +1002,7 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1014,7 +1014,7 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1028,7 +1028,7 @@ define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1042,7 +1042,7 @@ define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1055,7 +1055,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1068,7 +1068,7 @@ define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1082,7 +1082,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a)
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1096,7 +1096,7 @@ define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a)
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1108,7 +1108,7 @@ define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
 ; CHECK-NEXT:    uxtb r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %r = add i8 %z, %a
   ret i8 %r
 }
@@ -1218,7 +1218,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1358,7 +1358,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1377,7 +1377,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1400,7 +1400,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1420,18 +1420,18 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
 ; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
   %r = add i64 %z, %a
   ret i64 %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index 0f3aacf11d8c..e59fb0bb1ef4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -10,7 +10,7 @@ define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %b) {
 entry:
   %c = icmp eq <4 x i32> %b, zeroinitializer
   %s = select <4 x i1> %c, <4 x i32> %x, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -24,7 +24,7 @@ entry:
   %c = icmp eq <4 x i32> %b, zeroinitializer
   %xx = zext <4 x i32> %x to <4 x i64>
   %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
   ret i64 %z
 }
 
@@ -38,7 +38,7 @@ entry:
   %c = icmp eq <4 x i32> %b, zeroinitializer
   %xx = sext <4 x i32> %x to <4 x i64>
   %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
   ret i64 %z
 }
 
@@ -73,7 +73,7 @@ entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
   %xx = zext <2 x i32> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -114,7 +114,7 @@ entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
   %xx = sext <2 x i32> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -128,7 +128,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %xx = zext <8 x i16> %x to <8 x i32>
   %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
   ret i32 %z
 }
 
@@ -142,7 +142,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %xx = sext <8 x i16> %x to <8 x i32>
   %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
   ret i32 %z
 }
 
@@ -158,7 +158,7 @@ entry:
   %c = icmp eq <4 x i16> %b, zeroinitializer
   %xx = zext <4 x i16> %x to <4 x i32>
   %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -174,7 +174,7 @@ entry:
   %c = icmp eq <4 x i16> %b, zeroinitializer
   %xx = sext <4 x i16> %x to <4 x i32>
   %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -188,7 +188,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %b)
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   ret i16 %z
 }
 
@@ -314,7 +314,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %xx = zext <8 x i16> %x to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   ret i64 %z
 }
 
@@ -456,7 +456,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %xx = sext <8 x i16> %x to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   ret i64 %z
 }
 
@@ -492,7 +492,7 @@ entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
   %xx = zext <2 x i16> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -537,7 +537,7 @@ entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
   %xx = sext <2 x i16> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -551,7 +551,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   ret i32 %z
 }
 
@@ -565,7 +565,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   ret i32 %z
 }
 
@@ -582,7 +582,7 @@ entry:
   %c = icmp eq <4 x i8> %b, zeroinitializer
   %xx = zext <4 x i8> %x to <4 x i32>
   %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -600,7 +600,7 @@ entry:
   %c = icmp eq <4 x i8> %b, zeroinitializer
   %xx = sext <4 x i8> %x to <4 x i32>
   %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -615,7 +615,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i16>
   %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
   ret i16 %z
 }
 
@@ -630,7 +630,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i16>
   %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
   ret i16 %z
 }
 
@@ -647,7 +647,7 @@ entry:
   %c = icmp eq <8 x i8> %b, zeroinitializer
   %xx = zext <8 x i8> %x to <8 x i16>
   %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   ret i16 %z
 }
 
@@ -664,7 +664,7 @@ entry:
   %c = icmp eq <8 x i8> %b, zeroinitializer
   %xx = sext <8 x i8> %x to <8 x i16>
   %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   ret i16 %z
 }
 
@@ -678,7 +678,7 @@ define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %b) {
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> zeroinitializer
-  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s)
+  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
   ret i8 %z
 }
 
@@ -948,7 +948,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i64>
   %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
   ret i64 %z
 }
 
@@ -1257,7 +1257,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i64>
   %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
   ret i64 %z
 }
 
@@ -1293,7 +1293,7 @@ entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
   %xx = zext <2 x i8> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -1338,7 +1338,7 @@ entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
   %xx = sext <2 x i8> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -1372,7 +1372,7 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %b) {
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -1385,7 +1385,7 @@ define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %b, i32
 entry:
   %c = icmp eq <4 x i32> %b, zeroinitializer
   %s = select <4 x i1> %c, <4 x i32> %x, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1400,7 +1400,7 @@ entry:
   %c = icmp eq <4 x i32> %b, zeroinitializer
   %xx = zext <4 x i32> %x to <4 x i64>
   %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1415,7 +1415,7 @@ entry:
   %c = icmp eq <4 x i32> %b, zeroinitializer
   %xx = sext <4 x i32> %x to <4 x i64>
   %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1455,7 +1455,7 @@ entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
   %xx = zext <2 x i32> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1501,7 +1501,7 @@ entry:
   %c = icmp eq <2 x i32> %b, zeroinitializer
   %xx = sext <2 x i32> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1516,7 +1516,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %xx = zext <8 x i16> %x to <8 x i32>
   %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1531,7 +1531,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %xx = sext <8 x i16> %x to <8 x i32>
   %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1548,7 +1548,7 @@ entry:
   %c = icmp eq <4 x i16> %b, zeroinitializer
   %xx = zext <4 x i16> %x to <4 x i32>
   %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1565,7 +1565,7 @@ entry:
   %c = icmp eq <4 x i16> %b, zeroinitializer
   %xx = sext <4 x i16> %x to <4 x i32>
   %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1580,7 +1580,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16>
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1711,7 +1711,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %xx = zext <8 x i16> %x to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1858,7 +1858,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %xx = sext <8 x i16> %x to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1897,7 +1897,7 @@ entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
   %xx = zext <2 x i16> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1947,7 +1947,7 @@ entry:
   %c = icmp eq <2 x i16> %b, zeroinitializer
   %xx = sext <2 x i16> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1962,7 +1962,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1977,7 +1977,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1995,7 +1995,7 @@ entry:
   %c = icmp eq <4 x i8> %b, zeroinitializer
   %xx = zext <4 x i8> %x to <4 x i32>
   %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -2014,7 +2014,7 @@ entry:
   %c = icmp eq <4 x i8> %b, zeroinitializer
   %xx = sext <4 x i8> %x to <4 x i32>
   %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -2030,7 +2030,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i16>
   %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -2046,7 +2046,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i16>
   %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -2064,7 +2064,7 @@ entry:
   %c = icmp eq <8 x i8> %b, zeroinitializer
   %xx = zext <8 x i8> %x to <8 x i16>
   %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -2082,7 +2082,7 @@ entry:
   %c = icmp eq <8 x i8> %b, zeroinitializer
   %xx = sext <8 x i8> %x to <8 x i16>
   %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -2097,7 +2097,7 @@ define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> zeroinitializer
-  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s)
+  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
   %r = add i8 %z, %a
   ret i8 %r
 }
@@ -2372,7 +2372,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = zext <16 x i8> %x to <16 x i64>
   %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -2686,7 +2686,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %xx = sext <16 x i8> %x to <16 x i64>
   %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -2725,7 +2725,7 @@ entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
   %xx = zext <2 x i8> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -2775,7 +2775,7 @@ entry:
   %c = icmp eq <2 x i8> %b, zeroinitializer
   %xx = sext <2 x i8> %x to <2 x i64>
   %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -2814,18 +2814,18 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %b, i64
 entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
index fc06181978b7..cf9c2b6b39f1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
@@ -9,7 +9,7 @@ define arm_aapcs_vfpcc i32 @and_v2i32(<2 x i32> %x) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %x)
   ret i32 %z
 }
 
@@ -25,7 +25,7 @@ define arm_aapcs_vfpcc i32 @and_v4i32(<4 x i32> %x) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %x)
   ret i32 %z
 }
 
@@ -42,7 +42,7 @@ define arm_aapcs_vfpcc i32 @and_v8i32(<8 x i32> %x) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %x)
   ret i32 %z
 }
 
@@ -58,7 +58,7 @@ define arm_aapcs_vfpcc i16 @and_v4i16(<4 x i16> %x) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %x)
   ret i16 %z
 }
 
@@ -76,7 +76,7 @@ define arm_aapcs_vfpcc i16 @and_v8i16(<8 x i16> %x) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %x)
   ret i16 %z
 }
 
@@ -95,7 +95,7 @@ define arm_aapcs_vfpcc i16 @and_v16i16(<16 x i16> %x) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %x)
   ret i16 %z
 }
 
@@ -113,7 +113,7 @@ define arm_aapcs_vfpcc i8 @and_v8i8(<8 x i8> %x) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %x)
   ret i8 %z
 }
 
@@ -133,7 +133,7 @@ define arm_aapcs_vfpcc i8 @and_v16i8(<16 x i8> %x) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %x)
   ret i8 %z
 }
 
@@ -154,7 +154,7 @@ define arm_aapcs_vfpcc i8 @and_v32i8(<32 x i8> %x) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %x)
   ret i8 %z
 }
 
@@ -163,7 +163,7 @@ define arm_aapcs_vfpcc i64 @and_v1i64(<1 x i64> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %x)
   ret i64 %z
 }
 
@@ -178,7 +178,7 @@ define arm_aapcs_vfpcc i64 @and_v2i64(<2 x i64> %x) {
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x)
   ret i64 %z
 }
 
@@ -194,7 +194,7 @@ define arm_aapcs_vfpcc i64 @and_v4i64(<4 x i64> %x) {
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x)
   ret i64 %z
 }
 
@@ -207,7 +207,7 @@ define arm_aapcs_vfpcc i32 @and_v2i32_acc(<2 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %x)
   %r = and i32 %y, %z
   ret i32 %r
 }
@@ -225,7 +225,7 @@ define arm_aapcs_vfpcc i32 @and_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %x)
   %r = and i32 %y, %z
   ret i32 %r
 }
@@ -244,7 +244,7 @@ define arm_aapcs_vfpcc i32 @and_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %x)
   %r = and i32 %y, %z
   ret i32 %r
 }
@@ -262,7 +262,7 @@ define arm_aapcs_vfpcc i16 @and_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %x)
   %r = and i16 %y, %z
   ret i16 %r
 }
@@ -282,7 +282,7 @@ define arm_aapcs_vfpcc i16 @and_v8i16_acc(<8 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %x)
   %r = and i16 %y, %z
   ret i16 %r
 }
@@ -303,7 +303,7 @@ define arm_aapcs_vfpcc i16 @and_v16i16_acc(<16 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %x)
   %r = and i16 %y, %z
   ret i16 %r
 }
@@ -323,7 +323,7 @@ define arm_aapcs_vfpcc i8 @and_v8i8_acc(<8 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %x)
   %r = and i8 %y, %z
   ret i8 %r
 }
@@ -345,7 +345,7 @@ define arm_aapcs_vfpcc i8 @and_v16i8_acc(<16 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %x)
   %r = and i8 %y, %z
   ret i8 %r
 }
@@ -368,7 +368,7 @@ define arm_aapcs_vfpcc i8 @and_v32i8_acc(<32 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %x)
   %r = and i8 %y, %z
   ret i8 %r
 }
@@ -380,7 +380,7 @@ define arm_aapcs_vfpcc i64 @and_v1i64_acc(<1 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    ands r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %x)
   %r = and i64 %y, %z
   ret i64 %r
 }
@@ -398,7 +398,7 @@ define arm_aapcs_vfpcc i64 @and_v2i64_acc(<2 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x)
   %r = and i64 %y, %z
   ret i64 %r
 }
@@ -417,7 +417,7 @@ define arm_aapcs_vfpcc i64 @and_v4i64_acc(<4 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x)
   %r = and i64 %y, %z
   ret i64 %r
 }
@@ -430,7 +430,7 @@ define arm_aapcs_vfpcc i32 @or_v2i32(<2 x i32> %x) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %x)
   ret i32 %z
 }
 
@@ -446,7 +446,7 @@ define arm_aapcs_vfpcc i32 @or_v4i32(<4 x i32> %x) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x)
   ret i32 %z
 }
 
@@ -463,7 +463,7 @@ define arm_aapcs_vfpcc i32 @or_v8i32(<8 x i32> %x) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %x)
   ret i32 %z
 }
 
@@ -479,7 +479,7 @@ define arm_aapcs_vfpcc i16 @or_v4i16(<4 x i16> %x) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %x)
   ret i16 %z
 }
 
@@ -497,7 +497,7 @@ define arm_aapcs_vfpcc i16 @or_v8i16(<8 x i16> %x) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %x)
   ret i16 %z
 }
 
@@ -516,7 +516,7 @@ define arm_aapcs_vfpcc i16 @or_v16i16(<16 x i16> %x) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %x)
   ret i16 %z
 }
 
@@ -534,7 +534,7 @@ define arm_aapcs_vfpcc i8 @or_v8i8(<8 x i8> %x) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %x)
   ret i8 %z
 }
 
@@ -554,7 +554,7 @@ define arm_aapcs_vfpcc i8 @or_v16i8(<16 x i8> %x) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %x)
   ret i8 %z
 }
 
@@ -575,7 +575,7 @@ define arm_aapcs_vfpcc i8 @or_v32i8(<32 x i8> %x) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %x)
   ret i8 %z
 }
 
@@ -584,7 +584,7 @@ define arm_aapcs_vfpcc i64 @or_v1i64(<1 x i64> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %x)
   ret i64 %z
 }
 
@@ -599,7 +599,7 @@ define arm_aapcs_vfpcc i64 @or_v2i64(<2 x i64> %x) {
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x)
   ret i64 %z
 }
 
@@ -615,7 +615,7 @@ define arm_aapcs_vfpcc i64 @or_v4i64(<4 x i64> %x) {
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x)
   ret i64 %z
 }
 
@@ -628,7 +628,7 @@ define arm_aapcs_vfpcc i32 @or_v2i32_acc(<2 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %x)
   %r = or i32 %y, %z
   ret i32 %r
 }
@@ -646,7 +646,7 @@ define arm_aapcs_vfpcc i32 @or_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x)
   %r = or i32 %y, %z
   ret i32 %r
 }
@@ -665,7 +665,7 @@ define arm_aapcs_vfpcc i32 @or_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %x)
   %r = or i32 %y, %z
   ret i32 %r
 }
@@ -683,7 +683,7 @@ define arm_aapcs_vfpcc i16 @or_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %x)
   %r = or i16 %y, %z
   ret i16 %r
 }
@@ -703,7 +703,7 @@ define arm_aapcs_vfpcc i16 @or_v8i16_acc(<8 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %x)
   %r = or i16 %y, %z
   ret i16 %r
 }
@@ -724,7 +724,7 @@ define arm_aapcs_vfpcc i16 @or_v16i16_acc(<16 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %x)
   %r = or i16 %y, %z
   ret i16 %r
 }
@@ -744,7 +744,7 @@ define arm_aapcs_vfpcc i8 @or_v8i8_acc(<8 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %x)
   %r = or i8 %y, %z
   ret i8 %r
 }
@@ -766,7 +766,7 @@ define arm_aapcs_vfpcc i8 @or_v16i8_acc(<16 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %x)
   %r = or i8 %y, %z
   ret i8 %r
 }
@@ -789,7 +789,7 @@ define arm_aapcs_vfpcc i8 @or_v32i8_acc(<32 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %x)
   %r = or i8 %y, %z
   ret i8 %r
 }
@@ -801,7 +801,7 @@ define arm_aapcs_vfpcc i64 @or_v1i64_acc(<1 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    orrs r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %x)
   %r = or i64 %y, %z
   ret i64 %r
 }
@@ -819,7 +819,7 @@ define arm_aapcs_vfpcc i64 @or_v2i64_acc(<2 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x)
   %r = or i64 %y, %z
   ret i64 %r
 }
@@ -838,7 +838,7 @@ define arm_aapcs_vfpcc i64 @or_v4i64_acc(<4 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x)
   %r = or i64 %y, %z
   ret i64 %r
 }
@@ -851,7 +851,7 @@ define arm_aapcs_vfpcc i32 @xor_v2i32(<2 x i32> %x) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %x)
   ret i32 %z
 }
 
@@ -867,7 +867,7 @@ define arm_aapcs_vfpcc i32 @xor_v4i32(<4 x i32> %x) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %x)
   ret i32 %z
 }
 
@@ -884,7 +884,7 @@ define arm_aapcs_vfpcc i32 @xor_v8i32(<8 x i32> %x) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %x)
   ret i32 %z
 }
 
@@ -900,7 +900,7 @@ define arm_aapcs_vfpcc i16 @xor_v4i16(<4 x i16> %x) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %x)
   ret i16 %z
 }
 
@@ -918,7 +918,7 @@ define arm_aapcs_vfpcc i16 @xor_v8i16(<8 x i16> %x) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %x)
   ret i16 %z
 }
 
@@ -937,7 +937,7 @@ define arm_aapcs_vfpcc i16 @xor_v16i16(<16 x i16> %x) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %x)
   ret i16 %z
 }
 
@@ -955,7 +955,7 @@ define arm_aapcs_vfpcc i8 @xor_v8i8(<8 x i8> %x) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %x)
   ret i8 %z
 }
 
@@ -975,7 +975,7 @@ define arm_aapcs_vfpcc i8 @xor_v16i8(<16 x i8> %x) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %x)
   ret i8 %z
 }
 
@@ -996,7 +996,7 @@ define arm_aapcs_vfpcc i8 @xor_v32i8(<32 x i8> %x) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %x)
   ret i8 %z
 }
 
@@ -1005,7 +1005,7 @@ define arm_aapcs_vfpcc i64 @xor_v1i64(<1 x i64> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %x)
   ret i64 %z
 }
 
@@ -1020,7 +1020,7 @@ define arm_aapcs_vfpcc i64 @xor_v2i64(<2 x i64> %x) {
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x)
   ret i64 %z
 }
 
@@ -1036,7 +1036,7 @@ define arm_aapcs_vfpcc i64 @xor_v4i64(<4 x i64> %x) {
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x)
   ret i64 %z
 }
 
@@ -1049,7 +1049,7 @@ define arm_aapcs_vfpcc i32 @xor_v2i32_acc(<2 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %x)
   %r = xor i32 %y, %z
   ret i32 %r
 }
@@ -1067,7 +1067,7 @@ define arm_aapcs_vfpcc i32 @xor_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %x)
   %r = xor i32 %y, %z
   ret i32 %r
 }
@@ -1086,7 +1086,7 @@ define arm_aapcs_vfpcc i32 @xor_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %x)
   %r = xor i32 %y, %z
   ret i32 %r
 }
@@ -1104,7 +1104,7 @@ define arm_aapcs_vfpcc i16 @xor_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %x)
   %r = xor i16 %y, %z
   ret i16 %r
 }
@@ -1124,7 +1124,7 @@ define arm_aapcs_vfpcc i16 @xor_v8i16_acc(<8 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %x)
   %r = xor i16 %y, %z
   ret i16 %r
 }
@@ -1145,7 +1145,7 @@ define arm_aapcs_vfpcc i16 @xor_v16i16_acc(<16 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %x)
   %r = xor i16 %y, %z
   ret i16 %r
 }
@@ -1165,7 +1165,7 @@ define arm_aapcs_vfpcc i8 @xor_v8i8_acc(<8 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %x)
   %r = xor i8 %y, %z
   ret i8 %r
 }
@@ -1187,7 +1187,7 @@ define arm_aapcs_vfpcc i8 @xor_v16i8_acc(<16 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %x)
   %r = xor i8 %y, %z
   ret i8 %r
 }
@@ -1210,7 +1210,7 @@ define arm_aapcs_vfpcc i8 @xor_v32i8_acc(<32 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %x)
   %r = xor i8 %y, %z
   ret i8 %r
 }
@@ -1222,7 +1222,7 @@ define arm_aapcs_vfpcc i64 @xor_v1i64_acc(<1 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    eors r1, r3
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %x)
   %r = xor i64 %y, %z
   ret i64 %r
 }
@@ -1240,7 +1240,7 @@ define arm_aapcs_vfpcc i64 @xor_v2i64_acc(<2 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x)
   %r = xor i64 %y, %z
   ret i64 %r
 }
@@ -1259,44 +1259,44 @@ define arm_aapcs_vfpcc i64 @xor_v4i64_acc(<4 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x)
   %r = xor i64 %y, %z
   ret i64 %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>)
-declare i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>)
-declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>)
+declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
+declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
index 77f0c77033f9..8ead4f5cfebf 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
@@ -9,7 +9,7 @@ define arm_aapcs_vfpcc float @fadd_v2f32(<2 x float> %x, float %y) {
 ; CHECK-NEXT:    vadd.f32 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x)
   ret float %z
 }
 
@@ -30,7 +30,7 @@ define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vadd.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %y, <4 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x)
   ret float %z
 }
 
@@ -56,7 +56,7 @@ define arm_aapcs_vfpcc float @fadd_v8f32(<8 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vadd.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %y, <8 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x)
   ret float %z
 }
 
@@ -71,7 +71,7 @@ define arm_aapcs_vfpcc void @fadd_v2f16(<2 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half %y, <2 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fadd.f16.v2f16(half %y, <2 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -102,7 +102,7 @@ define arm_aapcs_vfpcc void @fadd_v4f16(<4 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -139,7 +139,7 @@ define arm_aapcs_vfpcc void @fadd_v8f16(<8 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half %y, <8 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -189,7 +189,7 @@ define arm_aapcs_vfpcc void @fadd_v16f16(<16 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half %y, <16 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -200,7 +200,7 @@ define arm_aapcs_vfpcc double @fadd_v1f64(<1 x double> %x, double %y) {
 ; CHECK-NEXT:    vadd.f64 d0, d1, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double %y, <1 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x)
   ret double %z
 }
 
@@ -211,7 +211,7 @@ define arm_aapcs_vfpcc double @fadd_v2f64(<2 x double> %x, double %y) {
 ; CHECK-NEXT:    vadd.f64 d0, d2, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %y, <2 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x)
   ret double %z
 }
 
@@ -224,7 +224,7 @@ define arm_aapcs_vfpcc double @fadd_v4f64(<4 x double> %x, double %y) {
 ; CHECK-NEXT:    vadd.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %y, <4 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x)
   ret double %z
 }
 
@@ -235,7 +235,7 @@ define arm_aapcs_vfpcc float @fadd_v2f32_nofast(<2 x float> %x, float %y) {
 ; CHECK-NEXT:    vadd.f32 s0, s4, s1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x)
+  %z = call float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x)
   ret float %z
 }
 
@@ -248,7 +248,7 @@ define arm_aapcs_vfpcc float @fadd_v4f32_nofast(<4 x float> %x, float %y) {
 ; CHECK-NEXT:    vadd.f32 s0, s4, s3
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %y, <4 x float> %x)
+  %z = call float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x)
   ret float %z
 }
 
@@ -265,7 +265,7 @@ define arm_aapcs_vfpcc float @fadd_v8f32_nofast(<8 x float> %x, float %y) {
 ; CHECK-NEXT:    vadd.f32 s0, s0, s7
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %y, <8 x float> %x)
+  %z = call float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x)
   ret float %z
 }
 
@@ -283,7 +283,7 @@ define arm_aapcs_vfpcc void @fadd_v4f16_nofast(<4 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x)
+  %z = call half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -308,7 +308,7 @@ define arm_aapcs_vfpcc void @fadd_v8f16_nofast(<8 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half %y, <8 x half> %x)
+  %z = call half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -345,7 +345,7 @@ define arm_aapcs_vfpcc void @fadd_v16f16_nofast(<16 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half %y, <16 x half> %x)
+  %z = call half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -356,7 +356,7 @@ define arm_aapcs_vfpcc double @fadd_v1f64_nofast(<1 x double> %x, double %y) {
 ; CHECK-NEXT:    vadd.f64 d0, d1, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double %y, <1 x double> %x)
+  %z = call double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x)
   ret double %z
 }
 
@@ -367,7 +367,7 @@ define arm_aapcs_vfpcc double @fadd_v2f64_nofast(<2 x double> %x, double %y) {
 ; CHECK-NEXT:    vadd.f64 d0, d2, d1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %y, <2 x double> %x)
+  %z = call double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x)
   ret double %z
 }
 
@@ -380,17 +380,17 @@ define arm_aapcs_vfpcc double @fadd_v4f64_nofast(<4 x double> %x, double %y) {
 ; CHECK-NEXT:    vadd.f64 d0, d0, d3
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %y, <4 x double> %x)
+  %z = call double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x)
   ret double %z
 }
 
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>)
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half, <2 x half>)
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>)
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>)
+declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
+declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
+declare half @llvm.vector.reduce.fadd.f16.v2f16(half, <2 x half>)
+declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
+declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
index a83fa6882cb9..45c6972312e3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
@@ -8,7 +8,7 @@ define arm_aapcs_vfpcc float @fmin_v2f32(<2 x float> %x) {
 ; CHECK-NEXT:    vminnm.f32 s0, s0, s1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x)
   ret float %z
 }
 
@@ -27,7 +27,7 @@ define arm_aapcs_vfpcc float @fmin_v4f32(<4 x float> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s4, s3
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x)
   ret float %z
 }
 
@@ -60,7 +60,7 @@ define arm_aapcs_vfpcc float @fmin_v8f32(<8 x float> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s2, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x)
   ret float %z
 }
 
@@ -83,7 +83,7 @@ define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x)
   ret half %z
 }
 
@@ -112,7 +112,7 @@ define arm_aapcs_vfpcc half @fmin_v8f16(<8 x half> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x)
   ret half %z
 }
 
@@ -170,7 +170,7 @@ define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x)
   ret half %z
 }
 
@@ -179,7 +179,7 @@ define arm_aapcs_vfpcc double @fmin_v1f64(<1 x double> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x)
   ret double %z
 }
 
@@ -189,7 +189,7 @@ define arm_aapcs_vfpcc double @fmin_v2f64(<2 x double> %x) {
 ; CHECK-NEXT:    vminnm.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x)
   ret double %z
 }
 
@@ -205,7 +205,7 @@ define arm_aapcs_vfpcc double @fmin_v4f64(<4 x double> %x) {
 ; CHECK-NEXT:    vminnm.f64 d0, d0, d4
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x)
   ret double %z
 }
 
@@ -215,7 +215,7 @@ define arm_aapcs_vfpcc float @fmin_v2f32_nofast(<2 x float> %x) {
 ; CHECK-NEXT:    vminnm.f32 s0, s0, s1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
+  %z = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x)
   ret float %z
 }
 
@@ -234,7 +234,7 @@ define arm_aapcs_vfpcc float @fmin_v4f32_nofast(<4 x float> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s4, s3
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
+  %z = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x)
   ret float %z
 }
 
@@ -258,7 +258,7 @@ define arm_aapcs_vfpcc float @fmin_v8f32_nofast(<8 x float> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x)
+  %z = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x)
   ret float %z
 }
 
@@ -281,7 +281,7 @@ define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
+  %z = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x)
   ret half %z
 }
 
@@ -310,7 +310,7 @@ define arm_aapcs_vfpcc half @fmin_v8f16_nofast(<8 x half> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
+  %z = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x)
   ret half %z
 }
 
@@ -352,7 +352,7 @@ define arm_aapcs_vfpcc half @fmin_v16f16_nofast(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
+  %z = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x)
   ret half %z
 }
 
@@ -361,7 +361,7 @@ define arm_aapcs_vfpcc double @fmin_v1f64_nofast(<1 x double> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x)
+  %z = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x)
   ret double %z
 }
 
@@ -371,7 +371,7 @@ define arm_aapcs_vfpcc double @fmin_v2f64_nofast(<2 x double> %x) {
 ; CHECK-NEXT:    vminnm.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x)
+  %z = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x)
   ret double %z
 }
 
@@ -383,7 +383,7 @@ define arm_aapcs_vfpcc double @fmin_v4f64_nofast(<4 x double> %x) {
 ; CHECK-NEXT:    vminnm.f64 d0, d0, d4
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x)
+  %z = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x)
   ret double %z
 }
 
@@ -394,7 +394,7 @@ define arm_aapcs_vfpcc float @fmin_v2f32_acc(<2 x float> %x, float %y) {
 ; CHECK-NEXT:    vminnm.f32 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x)
   %c = fcmp fast olt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -417,7 +417,7 @@ define arm_aapcs_vfpcc float @fmin_v4f32_acc(<4 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x)
   %c = fcmp fast olt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -453,7 +453,7 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc(<8 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x)
   %c = fcmp fast olt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -485,7 +485,7 @@ define arm_aapcs_vfpcc void @fmin_v4f16_acc(<4 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x)
   %c = fcmp fast olt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -503,7 +503,7 @@ define arm_aapcs_vfpcc void @fmin_v2f16_acc(<2 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmin.v2f16(<2 x half> %x)
   %c = fcmp fast olt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -542,7 +542,7 @@ define arm_aapcs_vfpcc void @fmin_v8f16_acc(<8 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x)
   %c = fcmp fast olt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -610,7 +610,7 @@ define arm_aapcs_vfpcc void @fmin_v16f16_acc(<16 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x)
   %c = fcmp fast olt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -623,7 +623,7 @@ define arm_aapcs_vfpcc double @fmin_v1f64_acc(<1 x double> %x, double %y) {
 ; CHECK-NEXT:    vminnm.f64 d0, d1, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x)
   %c = fcmp fast olt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -636,7 +636,7 @@ define arm_aapcs_vfpcc double @fmin_v2f64_acc(<2 x double> %x, double %y) {
 ; CHECK-NEXT:    vminnm.f64 d0, d2, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x)
   %c = fcmp fast olt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -655,7 +655,7 @@ define arm_aapcs_vfpcc double @fmin_v4f64_acc(<4 x double> %x, double %y) {
 ; CHECK-NEXT:    vminnm.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x)
   %c = fcmp fast olt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -670,7 +670,7 @@ define arm_aapcs_vfpcc float @fmin_v2f32_acc_nofast(<2 x float> %x, float %y) {
 ; CHECK-NEXT:    vselgt.f32 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
+  %z = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x)
   %c = fcmp olt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -697,7 +697,7 @@ define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vselgt.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
+  %z = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x)
   %c = fcmp olt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -729,7 +729,7 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vselgt.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x)
+  %z = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x)
   %c = fcmp olt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -765,7 +765,7 @@ define arm_aapcs_vfpcc void @fmin_v4f16_acc_nofast(<4 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
+  %z = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x)
   %c = fcmp olt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -808,7 +808,7 @@ define arm_aapcs_vfpcc void @fmin_v8f16_acc_nofast(<8 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
+  %z = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x)
   %c = fcmp olt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -864,7 +864,7 @@ define arm_aapcs_vfpcc void @fmin_v16f16_acc_nofast(<16 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x)
+  %z = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x)
   %c = fcmp olt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -879,7 +879,7 @@ define arm_aapcs_vfpcc double @fmin_v1f64_acc_nofast(<1 x double> %x, double %y)
 ; CHECK-NEXT:    vselgt.f64 d0, d1, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x)
+  %z = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x)
   %c = fcmp olt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -894,7 +894,7 @@ define arm_aapcs_vfpcc double @fmin_v2f64_acc_nofast(<2 x double> %x, double %y)
 ; CHECK-NEXT:    vselgt.f64 d0, d2, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x)
+  %z = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x)
   %c = fcmp olt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -911,7 +911,7 @@ define arm_aapcs_vfpcc double @fmin_v4f64_acc_nofast(<4 x double> %x, double %y)
 ; CHECK-NEXT:    vselgt.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x)
+  %z = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x)
   %c = fcmp olt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -923,7 +923,7 @@ define arm_aapcs_vfpcc float @fmax_v2f32(<2 x float> %x) {
 ; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x)
   ret float %z
 }
 
@@ -942,7 +942,7 @@ define arm_aapcs_vfpcc float @fmax_v4f32(<4 x float> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s4, s3
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x)
   ret float %z
 }
 
@@ -974,7 +974,7 @@ define arm_aapcs_vfpcc float @fmax_v8f32(<8 x float> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s2, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x)
   ret float %z
 }
 
@@ -997,7 +997,7 @@ define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x)
   ret half %z
 }
 
@@ -1026,7 +1026,7 @@ define arm_aapcs_vfpcc half @fmax_v8f16(<8 x half> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x)
   ret half %z
 }
 
@@ -1084,7 +1084,7 @@ define arm_aapcs_vfpcc half @fmax_v16f16(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x)
   ret half %z
 }
 
@@ -1093,7 +1093,7 @@ define arm_aapcs_vfpcc double @fmax_v1f64(<1 x double> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x)
   ret double %z
 }
 
@@ -1103,7 +1103,7 @@ define arm_aapcs_vfpcc double @fmax_v2f64(<2 x double> %x) {
 ; CHECK-NEXT:    vmaxnm.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x)
   ret double %z
 }
 
@@ -1119,7 +1119,7 @@ define arm_aapcs_vfpcc double @fmax_v4f64(<4 x double> %x) {
 ; CHECK-NEXT:    vmaxnm.f64 d0, d0, d4
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x)
   ret double %z
 }
 
@@ -1129,7 +1129,7 @@ define arm_aapcs_vfpcc float @fmax_v2f32_nofast(<2 x float> %x) {
 ; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
+  %z = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x)
   ret float %z
 }
 
@@ -1148,7 +1148,7 @@ define arm_aapcs_vfpcc float @fmax_v4f32_nofast(<4 x float> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s4, s3
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
+  %z = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x)
   ret float %z
 }
 
@@ -1172,7 +1172,7 @@ define arm_aapcs_vfpcc float @fmax_v8f32_nofast(<8 x float> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x)
+  %z = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x)
   ret float %z
 }
 
@@ -1195,7 +1195,7 @@ define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
+  %z = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x)
   ret half %z
 }
 
@@ -1224,7 +1224,7 @@ define arm_aapcs_vfpcc half @fmax_v8f16_nofast(<8 x half> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
+  %z = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x)
   ret half %z
 }
 
@@ -1266,7 +1266,7 @@ define arm_aapcs_vfpcc half @fmax_v16f16_nofast(<16 x half> %x) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)
+  %z = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x)
   ret half %z
 }
 
@@ -1275,7 +1275,7 @@ define arm_aapcs_vfpcc double @fmax_v1f64_nofast(<1 x double> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x)
+  %z = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x)
   ret double %z
 }
 
@@ -1285,7 +1285,7 @@ define arm_aapcs_vfpcc double @fmax_v2f64_nofast(<2 x double> %x) {
 ; CHECK-NEXT:    vmaxnm.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x)
+  %z = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x)
   ret double %z
 }
 
@@ -1297,7 +1297,7 @@ define arm_aapcs_vfpcc double @fmax_v4f64_nofast(<4 x double> %x) {
 ; CHECK-NEXT:    vmaxnm.f64 d0, d0, d4
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x)
+  %z = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x)
   ret double %z
 }
 
@@ -1308,7 +1308,7 @@ define arm_aapcs_vfpcc float @fmax_v2f32_acc(<2 x float> %x, float %y) {
 ; CHECK-NEXT:    vmaxnm.f32 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x)
   %c = fcmp fast ogt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -1331,7 +1331,7 @@ define arm_aapcs_vfpcc float @fmax_v4f32_acc(<4 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x)
   %c = fcmp fast ogt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -1367,7 +1367,7 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc(<8 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x)
   %c = fcmp fast ogt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -1384,7 +1384,7 @@ define arm_aapcs_vfpcc void @fmax_v2f16_acc(<2 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmax.v2f16(<2 x half> %x)
   %c = fcmp fast ogt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -1417,7 +1417,7 @@ define arm_aapcs_vfpcc void @fmax_v4f16_acc(<4 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x)
   %c = fcmp fast ogt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -1456,7 +1456,7 @@ define arm_aapcs_vfpcc void @fmax_v8f16_acc(<8 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x)
   %c = fcmp fast ogt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -1524,7 +1524,7 @@ define arm_aapcs_vfpcc void @fmax_v16f16_acc(<16 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x)
   %c = fcmp fast ogt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -1537,7 +1537,7 @@ define arm_aapcs_vfpcc double @fmax_v1f64_acc(<1 x double> %x, double %y) {
 ; CHECK-NEXT:    vmaxnm.f64 d0, d1, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x)
   %c = fcmp fast ogt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -1550,7 +1550,7 @@ define arm_aapcs_vfpcc double @fmax_v2f64_acc(<2 x double> %x, double %y) {
 ; CHECK-NEXT:    vmaxnm.f64 d0, d2, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x)
   %c = fcmp fast ogt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -1569,7 +1569,7 @@ define arm_aapcs_vfpcc double @fmax_v4f64_acc(<4 x double> %x, double %y) {
 ; CHECK-NEXT:    vmaxnm.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x)
   %c = fcmp fast ogt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -1584,7 +1584,7 @@ define arm_aapcs_vfpcc float @fmax_v2f32_acc_nofast(<2 x float> %x, float %y) {
 ; CHECK-NEXT:    vselgt.f32 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
+  %z = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x)
   %c = fcmp ogt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -1611,7 +1611,7 @@ define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vselgt.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
+  %z = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x)
   %c = fcmp ogt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -1643,7 +1643,7 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vselgt.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x)
+  %z = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x)
   %c = fcmp ogt float %y, %z
   %r = select i1 %c, float %y, float %z
   ret float %r
@@ -1679,7 +1679,7 @@ define arm_aapcs_vfpcc void @fmax_v4f16_acc_nofast(<4 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
+  %z = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x)
   %c = fcmp ogt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -1722,7 +1722,7 @@ define arm_aapcs_vfpcc void @fmax_v8f16_acc_nofast(<8 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
+  %z = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x)
   %c = fcmp ogt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -1778,7 +1778,7 @@ define arm_aapcs_vfpcc void @fmax_v16f16_acc_nofast(<16 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x)
+  %z = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x)
   %c = fcmp ogt half %y, %z
   %r = select i1 %c, half %y, half %z
   store half %r, half* %yy
@@ -1793,7 +1793,7 @@ define arm_aapcs_vfpcc double @fmax_v1f64_acc_nofast(<1 x double> %x, double %y)
 ; CHECK-NEXT:    vselgt.f64 d0, d1, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x)
+  %z = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x)
   %c = fcmp ogt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -1808,7 +1808,7 @@ define arm_aapcs_vfpcc double @fmax_v2f64_acc_nofast(<2 x double> %x, double %y)
 ; CHECK-NEXT:    vselgt.f64 d0, d2, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x)
+  %z = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x)
   %c = fcmp ogt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
@@ -1825,29 +1825,29 @@ define arm_aapcs_vfpcc double @fmax_v4f64_acc_nofast(<4 x double> %x, double %y)
 ; CHECK-NEXT:    vselgt.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x)
+  %z = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x)
   %c = fcmp ogt double %y, %z
   %r = select i1 %c, double %y, double %z
   ret double %r
 }
 
-declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>)
-declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
-declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>)
-declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>)
-declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>)
-declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>)
-declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>)
-declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>)
-declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>)
-declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>)
+declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
+declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)
+declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>)
+declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
+declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
+declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>)
+declare half @llvm.vector.reduce.fmin.v2f16(<2 x half>)
+declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
+declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
index 89d1546932a4..940e2e2f4a0c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
@@ -9,7 +9,7 @@ define arm_aapcs_vfpcc float @fmul_v2f32(<2 x float> %x, float %y) {
 ; CHECK-NEXT:    vmul.f32 s0, s4, s0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %y, <2 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x)
   ret float %z
 }
 
@@ -30,7 +30,7 @@ define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vmul.f32 s0, s4, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %y, <4 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x)
   ret float %z
 }
 
@@ -56,7 +56,7 @@ define arm_aapcs_vfpcc float @fmul_v8f32(<8 x float> %x, float %y) {
 ; CHECK-NOFP-NEXT:    vmul.f32 s0, s8, s0
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
-  %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %y, <8 x float> %x)
+  %z = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x)
   ret float %z
 }
 
@@ -71,7 +71,7 @@ define arm_aapcs_vfpcc void @fmul_v2f16(<2 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -102,7 +102,7 @@ define arm_aapcs_vfpcc void @fmul_v4f16(<4 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half %y, <4 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -139,7 +139,7 @@ define arm_aapcs_vfpcc void @fmul_v8f16(<8 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half %y, <8 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -189,7 +189,7 @@ define arm_aapcs_vfpcc void @fmul_v16f16(<16 x half> %x, half* %yy) {
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half %y, <16 x half> %x)
+  %z = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -200,7 +200,7 @@ define arm_aapcs_vfpcc double @fmul_v1f64(<1 x double> %x, double %y) {
 ; CHECK-NEXT:    vmul.f64 d0, d1, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double %y, <1 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x)
   ret double %z
 }
 
@@ -211,7 +211,7 @@ define arm_aapcs_vfpcc double @fmul_v2f64(<2 x double> %x, double %y) {
 ; CHECK-NEXT:    vmul.f64 d0, d2, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %y, <2 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x)
   ret double %z
 }
 
@@ -224,7 +224,7 @@ define arm_aapcs_vfpcc double @fmul_v4f64(<4 x double> %x, double %y) {
 ; CHECK-NEXT:    vmul.f64 d0, d4, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %y, <4 x double> %x)
+  %z = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x)
   ret double %z
 }
 
@@ -235,7 +235,7 @@ define arm_aapcs_vfpcc float @fmul_v2f32_nofast(<2 x float> %x, float %y) {
 ; CHECK-NEXT:    vmul.f32 s0, s4, s1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %y, <2 x float> %x)
+  %z = call float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x)
   ret float %z
 }
 
@@ -248,7 +248,7 @@ define arm_aapcs_vfpcc float @fmul_v4f32_nofast(<4 x float> %x, float %y) {
 ; CHECK-NEXT:    vmul.f32 s0, s4, s3
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %y, <4 x float> %x)
+  %z = call float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x)
   ret float %z
 }
 
@@ -265,7 +265,7 @@ define arm_aapcs_vfpcc float @fmul_v8f32_nofast(<8 x float> %x, float %y) {
 ; CHECK-NEXT:    vmul.f32 s0, s0, s7
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %y, <8 x float> %x)
+  %z = call float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x)
   ret float %z
 }
 
@@ -280,7 +280,7 @@ define arm_aapcs_vfpcc void @fmul_v2f16_nofast(<2 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x)
+  %z = call half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -299,7 +299,7 @@ define arm_aapcs_vfpcc void @fmul_v4f16_nofast(<4 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half %y, <4 x half> %x)
+  %z = call half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -324,7 +324,7 @@ define arm_aapcs_vfpcc void @fmul_v8f16_nofast(<8 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half %y, <8 x half> %x)
+  %z = call half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -361,7 +361,7 @@ define arm_aapcs_vfpcc void @fmul_v16f16_nofast(<16 x half> %x, half* %yy) {
 ; CHECK-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
-  %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half %y, <16 x half> %x)
+  %z = call half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x)
   store half %z, half* %yy
   ret void
 }
@@ -372,7 +372,7 @@ define arm_aapcs_vfpcc double @fmul_v1f64_nofast(<1 x double> %x, double %y) {
 ; CHECK-NEXT:    vmul.f64 d0, d1, d0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double %y, <1 x double> %x)
+  %z = call double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x)
   ret double %z
 }
 
@@ -383,7 +383,7 @@ define arm_aapcs_vfpcc double @fmul_v2f64_nofast(<2 x double> %x, double %y) {
 ; CHECK-NEXT:    vmul.f64 d0, d2, d1
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %y, <2 x double> %x)
+  %z = call double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x)
   ret double %z
 }
 
@@ -396,17 +396,17 @@ define arm_aapcs_vfpcc double @fmul_v4f64_nofast(<4 x double> %x, double %y) {
 ; CHECK-NEXT:    vmul.f64 d0, d0, d3
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %y, <4 x double> %x)
+  %z = call double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x)
   ret double %z
 }
 
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>)
-declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half, <16 x half>)
-declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half, <2 x half>)
-declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>)
-declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half, <8 x half>)
+declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
+declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>)
+declare half @llvm.vector.reduce.fmul.f16.v2f16(half, <2 x half>)
+declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>)
+declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index 286277945eb6..2544474932c9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -65,7 +65,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %0 = getelementptr inbounds i32, i32* %x, i32 %index
   %1 = bitcast i32* %0 to <4 x i32>*
   %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
-  %2 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %wide.load)
+  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
   %3 = add i32 %2, %vec.phi
   %index.next = add i32 %index, 4
   %4 = icmp eq i32 %index.next, %n.vec
@@ -167,7 +167,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %3, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %2)
+  %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -267,7 +267,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %3, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %2)
+  %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -367,7 +367,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %3, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %2)
+  %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -467,7 +467,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %3, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %2)
+  %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -568,7 +568,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %3, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %4 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
+  %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -665,7 +665,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %3, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %4 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
+  %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -762,7 +762,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %4, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %3)
+  %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -852,7 +852,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %0 = getelementptr inbounds i32, i32* %x, i32 %index
   %1 = bitcast i32* %0 to <4 x i32>*
   %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
-  %l5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
+  %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
   %2 = icmp slt i32 %vec.phi, %l5
   %3 = select i1 %2, i32 %vec.phi, i32 %l5
   %index.next = add i32 %index, 4
@@ -958,7 +958,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %4, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %3)
+  %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -1048,7 +1048,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %0 = getelementptr inbounds i32, i32* %x, i32 %index
   %1 = bitcast i32* %0 to <4 x i32>*
   %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
-  %l5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
+  %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
   %2 = icmp sgt i32 %vec.phi, %l5
   %3 = select i1 %2, i32 %vec.phi, i32 %l5
   %index.next = add i32 %index, 4
@@ -1154,7 +1154,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %4, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %3)
+  %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -1244,7 +1244,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %0 = getelementptr inbounds i32, i32* %x, i32 %index
   %1 = bitcast i32* %0 to <4 x i32>*
   %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
-  %l5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
+  %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
   %2 = icmp ult i32 %vec.phi, %l5
   %3 = select i1 %2, i32 %vec.phi, i32 %l5
   %index.next = add i32 %index, 4
@@ -1350,7 +1350,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %4, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %3)
+  %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -1440,7 +1440,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %0 = getelementptr inbounds i32, i32* %x, i32 %index
   %1 = bitcast i32* %0 to <4 x i32>*
   %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
-  %l5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
+  %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
   %2 = icmp ugt i32 %vec.phi, %l5
   %3 = select i1 %2, i32 %vec.phi, i32 %l5
   %index.next = add i32 %index, 4
@@ -1553,7 +1553,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %4, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %5 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %3)
+  %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -1658,7 +1658,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %4, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body
-  %5 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %3)
+  %5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
   %cmp.n = icmp eq i32 %n.vec, %n
   br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
 
@@ -1722,7 +1722,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %1 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
-  %3 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %2)
+  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
   %4 = add i32 %3, %vec.phi
   %index.next = add i32 %index, 4
   %5 = icmp eq i32 %index.next, %n.vec
@@ -1777,7 +1777,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
   %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer
-  %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
+  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
   %7 = add i32 %6, %vec.phi
   %index.next = add i32 %index, 4
   %8 = icmp eq i32 %index.next, %n.vec
@@ -1828,7 +1828,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
   %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
   %3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer
-  %4 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %3)
+  %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
   %5 = add i32 %4, %vec.phi
   %index.next = add i32 %index, 8
   %6 = icmp eq i32 %index.next, %n.vec
@@ -1885,7 +1885,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32>
   %6 = mul nsw <8 x i32> %5, %2
   %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer
-  %8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %7)
+  %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
   %9 = add i32 %8, %vec.phi
   %index.next = add i32 %index, 8
   %10 = icmp eq i32 %index.next, %n.vec
@@ -1936,7 +1936,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
   %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer
-  %4 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %3)
+  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
   %5 = add i32 %4, %vec.phi
   %index.next = add i32 %index, 16
   %6 = icmp eq i32 %index.next, %n.vec
@@ -1993,7 +1993,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32>
   %6 = mul nuw nsw <16 x i32> %5, %2
   %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer
-  %8 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %7)
+  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
   %9 = add i32 %8, %vec.phi
   %index.next = add i32 %index, 16
   %10 = icmp eq i32 %index.next, %n.vec
@@ -2043,7 +2043,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %1 = bitcast i16* %0 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
   %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer
-  %3 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %2)
+  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
   %4 = add i16 %3, %vec.phi
   %index.next = add i32 %index, 8
   %5 = icmp eq i32 %index.next, %n.vec
@@ -2098,7 +2098,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
   %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load
   %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer
-  %6 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %5)
+  %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5)
   %7 = add i16 %6, %vec.phi
   %index.next = add i32 %index, 8
   %8 = icmp eq i32 %index.next, %n.vec
@@ -2149,7 +2149,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
   %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer
-  %4 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %3)
+  %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
   %5 = add i16 %4, %vec.phi
   %index.next = add i32 %index, 16
   %6 = icmp eq i32 %index.next, %n.vec
@@ -2206,7 +2206,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16>
   %6 = mul nuw <16 x i16> %5, %2
   %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer
-  %8 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %7)
+  %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
   %9 = add i16 %8, %vec.phi
   %index.next = add i32 %index, 16
   %10 = icmp eq i32 %index.next, %n.vec
@@ -2256,7 +2256,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %1 = bitcast i8* %0 to <16 x i8>*
   %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer
-  %3 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %2)
+  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
   %4 = add i8 %3, %vec.phi
   %index.next = add i32 %index, 16
   %5 = icmp eq i32 %index.next, %n.vec
@@ -2311,7 +2311,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
   %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer
-  %6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %5)
+  %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
   %7 = add i8 %6, %vec.phi
   %index.next = add i32 %index, 16
   %8 = icmp eq i32 %index.next, %n.vec
@@ -2364,7 +2364,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
   %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer
-  %4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %3)
+  %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
   %5 = add i64 %4, %vec.phi
   %index.next = add i32 %index, 4
   %6 = icmp eq i32 %index.next, %n.vec
@@ -2423,7 +2423,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %5 = sext <4 x i32> %wide.masked.load14 to <4 x i64>
   %6 = mul nsw <4 x i64> %5, %2
   %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer
-  %8 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %7)
+  %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7)
   %9 = add i64 %8, %vec.phi
   %index.next = add i32 %index, 4
   %10 = icmp eq i32 %index.next, %n.vec
@@ -2482,7 +2482,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64>
   %6 = mul nsw <8 x i64> %5, %2
   %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer
-  %8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %7)
+  %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7)
   %9 = add i64 %8, %vec.phi
   %index.next = add i32 %index, 8
   %10 = icmp eq i32 %index.next, %n.vec
@@ -2497,26 +2497,26 @@ declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) #3
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3
 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1
 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) #3
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) #3
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) #3
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) #3
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) #3
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) #3
-
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3
+
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
index b83b51b6f564..ee15f82a71f5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -8,7 +8,7 @@ define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEXT:    bx lr
 entry:
   %m = mul <4 x i32> %x, %y
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   ret i32 %z
 }
 
@@ -21,7 +21,7 @@ entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %yy = zext <4 x i32> %y to <4 x i64>
   %m = mul <4 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
   ret i64 %z
 }
 
@@ -34,7 +34,7 @@ entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %yy = sext <4 x i32> %y to <4 x i64>
   %m = mul <4 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
   ret i64 %z
 }
 
@@ -53,7 +53,7 @@ entry:
   %xx = zext <2 x i32> %x to <2 x i64>
   %yy = zext <2 x i32> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   ret i64 %z
 }
 
@@ -72,7 +72,7 @@ entry:
   %xx = sext <2 x i32> %x to <2 x i64>
   %yy = sext <2 x i32> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   ret i64 %z
 }
 
@@ -85,7 +85,7 @@ entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %yy = zext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
   ret i32 %z
 }
 
@@ -98,7 +98,7 @@ entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %yy = sext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
   ret i32 %z
 }
 
@@ -113,7 +113,7 @@ entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %yy = zext <4 x i16> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   ret i32 %z
 }
 
@@ -128,7 +128,7 @@ entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %yy = sext <4 x i16> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   ret i32 %z
 }
 
@@ -140,7 +140,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y)
 ; CHECK-NEXT:    bx lr
 entry:
   %m = mul <8 x i16> %x, %y
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
   ret i16 %z
 }
 
@@ -153,7 +153,7 @@ entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %yy = zext <8 x i16> %y to <8 x i64>
   %m = mul <8 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
   ret i64 %z
 }
 
@@ -166,7 +166,7 @@ entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %yy = sext <8 x i16> %y to <8 x i64>
   %m = mul <8 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
   ret i64 %z
 }
 
@@ -180,7 +180,7 @@ entry:
   %yy = zext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
   %ma = zext <8 x i32> %m to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
   ret i64 %z
 }
 
@@ -194,7 +194,7 @@ entry:
   %yy = sext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
   %ma = sext <8 x i32> %m to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
   ret i64 %z
 }
 
@@ -207,7 +207,7 @@ entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %m = mul <8 x i32> %xx, %xx
   %ma = zext <8 x i32> %m to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
   ret i64 %z
 }
 
@@ -228,7 +228,7 @@ entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %yy = zext <2 x i16> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   ret i64 %z
 }
 
@@ -250,7 +250,7 @@ entry:
   %xx = sext <2 x i16> %x to <2 x i64>
   %yy = sext <2 x i16> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   ret i64 %z
 }
 
@@ -263,7 +263,7 @@ entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %yy = zext <16 x i8> %y to <16 x i32>
   %m = mul <16 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
   ret i32 %z
 }
 
@@ -276,7 +276,7 @@ entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %yy = sext <16 x i8> %y to <16 x i32>
   %m = mul <16 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
   ret i32 %z
 }
 
@@ -290,7 +290,7 @@ entry:
   %yy = zext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
   %ma = zext <16 x i16> %m to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
   ret i32 %z
 }
 
@@ -304,7 +304,7 @@ entry:
   %yy = sext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
   %ma = sext <16 x i16> %m to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
   ret i32 %z
 }
 
@@ -317,7 +317,7 @@ entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %m = mul <16 x i16> %xx, %xx
   %ma = zext <16 x i16> %m to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
   ret i32 %z
 }
 
@@ -333,7 +333,7 @@ entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %yy = zext <4 x i8> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   ret i32 %z
 }
 
@@ -350,7 +350,7 @@ entry:
   %xx = sext <4 x i8> %x to <4 x i32>
   %yy = sext <4 x i8> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   ret i32 %z
 }
 
@@ -364,7 +364,7 @@ entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %yy = zext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
   ret i16 %z
 }
 
@@ -378,7 +378,7 @@ entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %yy = sext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
   ret i16 %z
 }
 
@@ -394,7 +394,7 @@ entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %yy = zext <8 x i8> %y to <8 x i16>
   %m = mul <8 x i16> %xx, %yy
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
   ret i16 %z
 }
 
@@ -410,7 +410,7 @@ entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %yy = sext <8 x i8> %y to <8 x i16>
   %m = mul <8 x i16> %xx, %yy
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
   ret i16 %z
 }
 
@@ -422,7 +422,7 @@ define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-NEXT:    bx lr
 entry:
   %m = mul <16 x i8> %x, %y
-  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
+  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m)
   ret i8 %z
 }
 
@@ -636,7 +636,7 @@ entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
   %m = mul <16 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
   ret i64 %z
 }
 
@@ -803,7 +803,7 @@ entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %yy = sext <16 x i8> %y to <16 x i64>
   %m = mul <16 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
   ret i64 %z
 }
 
@@ -826,7 +826,7 @@ entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %yy = zext <2 x i8> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   ret i64 %z
 }
 
@@ -848,7 +848,7 @@ entry:
   %xx = sext <2 x i8> %x to <2 x i64>
   %yy = sext <2 x i8> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   ret i64 %z
 }
 
@@ -879,7 +879,7 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %m = mul <2 x i64> %x, %y
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   ret i64 %z
 }
 
@@ -890,7 +890,7 @@ define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, i32
 ; CHECK-NEXT:    bx lr
 entry:
   %m = mul <4 x i32> %x, %y
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -904,7 +904,7 @@ entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %yy = zext <4 x i32> %y to <4 x i64>
   %m = mul <4 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -918,7 +918,7 @@ entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %yy = sext <4 x i32> %y to <4 x i64>
   %m = mul <4 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -942,7 +942,7 @@ entry:
   %xx = zext <2 x i32> %x to <2 x i64>
   %yy = zext <2 x i32> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -966,7 +966,7 @@ entry:
   %xx = sext <2 x i32> %x to <2 x i64>
   %yy = sext <2 x i32> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -980,7 +980,7 @@ entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %yy = zext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -994,7 +994,7 @@ entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %yy = sext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1010,7 +1010,7 @@ entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %yy = zext <4 x i16> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1026,7 +1026,7 @@ entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %yy = sext <4 x i16> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1039,7 +1039,7 @@ define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16>
 ; CHECK-NEXT:    bx lr
 entry:
   %m = mul <8 x i16> %x, %y
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1053,7 +1053,7 @@ entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %yy = zext <8 x i16> %y to <8 x i64>
   %m = mul <8 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1067,7 +1067,7 @@ entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %yy = sext <8 x i16> %y to <8 x i64>
   %m = mul <8 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1082,7 +1082,7 @@ entry:
   %yy = zext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
   %ma = zext <8 x i32> %m to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1097,7 +1097,7 @@ entry:
   %yy = sext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
   %ma = sext <8 x i32> %m to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1111,7 +1111,7 @@ entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %m = mul <8 x i32> %xx, %xx
   %ma = zext <8 x i32> %m to <8 x i64>
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1137,7 +1137,7 @@ entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %yy = zext <2 x i16> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1164,7 +1164,7 @@ entry:
   %xx = sext <2 x i16> %x to <2 x i64>
   %yy = sext <2 x i16> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1178,7 +1178,7 @@ entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %yy = zext <16 x i8> %y to <16 x i32>
   %m = mul <16 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1192,7 +1192,7 @@ entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %yy = sext <16 x i8> %y to <16 x i32>
   %m = mul <16 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1207,7 +1207,7 @@ entry:
   %yy = zext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
   %ma = zext <16 x i16> %m to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1222,7 +1222,7 @@ entry:
   %yy = sext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
   %ma = sext <16 x i16> %m to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1236,7 +1236,7 @@ entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %m = mul <16 x i16> %xx, %xx
   %ma = zext <16 x i16> %m to <16 x i32>
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1253,7 +1253,7 @@ entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %yy = zext <4 x i8> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1271,7 +1271,7 @@ entry:
   %xx = sext <4 x i8> %x to <4 x i32>
   %yy = sext <4 x i8> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1286,7 +1286,7 @@ entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %yy = zext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1301,7 +1301,7 @@ entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %yy = sext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1318,7 +1318,7 @@ entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %yy = zext <8 x i8> %y to <8 x i16>
   %m = mul <8 x i16> %xx, %yy
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1335,7 +1335,7 @@ entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %yy = sext <8 x i8> %y to <8 x i16>
   %m = mul <8 x i16> %xx, %yy
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1348,7 +1348,7 @@ define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %
 ; CHECK-NEXT:    bx lr
 entry:
   %m = mul <16 x i8> %x, %y
-  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
+  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m)
   %r = add i8 %z, %a
   ret i8 %r
 }
@@ -1565,7 +1565,7 @@ entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
   %m = mul <16 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1737,7 +1737,7 @@ entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %yy = sext <16 x i8> %y to <16 x i64>
   %m = mul <16 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1765,7 +1765,7 @@ entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %yy = zext <2 x i8> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1792,7 +1792,7 @@ entry:
   %xx = sext <2 x i8> %x to <2 x i64>
   %yy = sext <2 x i8> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1826,18 +1826,18 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %m = mul <2 x i64> %x, %y
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
   %r = add i64 %z, %a
   ret i64 %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index 02d124890c6b..72462bb87f02 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -11,7 +11,7 @@ entry:
   %c = icmp eq <4 x i32> %b, zeroinitializer
   %m = mul <4 x i32> %x, %y
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -27,7 +27,7 @@ entry:
   %yy = zext <4 x i32> %y to <4 x i64>
   %m = mul <4 x i64> %xx, %yy
   %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
   ret i64 %z
 }
 
@@ -43,7 +43,7 @@ entry:
   %yy = sext <4 x i32> %y to <4 x i64>
   %m = mul <4 x i64> %xx, %yy
   %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
   ret i64 %z
 }
 
@@ -79,7 +79,7 @@ entry:
   %yy = zext <2 x i32> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -115,7 +115,7 @@ entry:
   %yy = sext <2 x i32> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -131,7 +131,7 @@ entry:
   %yy = zext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
   %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
   ret i32 %z
 }
 
@@ -147,7 +147,7 @@ entry:
   %yy = sext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
   %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
   ret i32 %z
 }
 
@@ -166,7 +166,7 @@ entry:
   %yy = zext <4 x i16> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -185,7 +185,7 @@ entry:
   %yy = sext <4 x i16> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -200,7 +200,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %m = mul <8 x i16> %x, %y
   %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   ret i16 %z
 }
 
@@ -216,7 +216,7 @@ entry:
   %yy = zext <8 x i16> %y to <8 x i64>
   %m = mul <8 x i64> %xx, %yy
   %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   ret i64 %z
 }
 
@@ -232,7 +232,7 @@ entry:
   %yy = sext <8 x i16> %y to <8 x i64>
   %m = mul <8 x i64> %xx, %yy
   %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   ret i64 %z
 }
 
@@ -249,7 +249,7 @@ entry:
   %m = mul <8 x i32> %xx, %yy
   %ma = zext <8 x i32> %m to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   ret i64 %z
 }
 
@@ -266,7 +266,7 @@ entry:
   %m = mul <8 x i32> %xx, %yy
   %ma = sext <8 x i32> %m to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   ret i64 %z
 }
 
@@ -282,7 +282,7 @@ entry:
   %m = mul <8 x i32> %xx, %xx
   %ma = zext <8 x i32> %m to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   ret i64 %z
 }
 
@@ -334,7 +334,7 @@ entry:
   %yy = zext <2 x i16> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -385,7 +385,7 @@ entry:
   %yy = sext <2 x i16> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -401,7 +401,7 @@ entry:
   %yy = zext <16 x i8> %y to <16 x i32>
   %m = mul <16 x i32> %xx, %yy
   %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   ret i32 %z
 }
 
@@ -417,7 +417,7 @@ entry:
   %yy = sext <16 x i8> %y to <16 x i32>
   %m = mul <16 x i32> %xx, %yy
   %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   ret i32 %z
 }
 
@@ -434,7 +434,7 @@ entry:
   %m = mul <16 x i16> %xx, %yy
   %ma = zext <16 x i16> %m to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   ret i32 %z
 }
 
@@ -451,7 +451,7 @@ entry:
   %m = mul <16 x i16> %xx, %yy
   %ma = sext <16 x i16> %m to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   ret i32 %z
 }
 
@@ -467,7 +467,7 @@ entry:
   %m = mul <16 x i16> %xx, %xx
   %ma = zext <16 x i16> %m to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   ret i32 %z
 }
 
@@ -487,7 +487,7 @@ entry:
   %yy = zext <4 x i8> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -509,7 +509,7 @@ entry:
   %yy = sext <4 x i8> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   ret i32 %z
 }
 
@@ -526,7 +526,7 @@ entry:
   %yy = zext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
   %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
   ret i16 %z
 }
 
@@ -543,7 +543,7 @@ entry:
   %yy = sext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
   %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
   ret i16 %z
 }
 
@@ -563,7 +563,7 @@ entry:
   %yy = zext <8 x i8> %y to <8 x i16>
   %m = mul <8 x i16> %xx, %yy
   %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   ret i16 %z
 }
 
@@ -583,7 +583,7 @@ entry:
   %yy = sext <8 x i8> %y to <8 x i16>
   %m = mul <8 x i16> %xx, %yy
   %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   ret i16 %z
 }
 
@@ -598,7 +598,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %m = mul <16 x i8> %x, %y
   %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
-  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s)
+  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
   ret i8 %z
 }
 
@@ -1010,7 +1010,7 @@ entry:
   %yy = zext <16 x i8> %y to <16 x i64>
   %m = mul <16 x i64> %xx, %yy
   %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
   ret i64 %z
 }
 
@@ -1353,7 +1353,7 @@ entry:
   %yy = sext <16 x i8> %y to <16 x i64>
   %m = mul <16 x i64> %xx, %yy
   %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
   ret i64 %z
 }
 
@@ -1405,7 +1405,7 @@ entry:
   %yy = zext <2 x i8> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -1456,7 +1456,7 @@ entry:
   %yy = sext <2 x i8> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -1509,7 +1509,7 @@ entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %m = mul <2 x i64> %x, %y
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   ret i64 %z
 }
 
@@ -1523,7 +1523,7 @@ entry:
   %c = icmp eq <4 x i32> %b, zeroinitializer
   %m = mul <4 x i32> %x, %y
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1540,7 +1540,7 @@ entry:
   %yy = zext <4 x i32> %y to <4 x i64>
   %m = mul <4 x i64> %xx, %yy
   %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1557,7 +1557,7 @@ entry:
   %yy = sext <4 x i32> %y to <4 x i64>
   %m = mul <4 x i64> %xx, %yy
   %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1598,7 +1598,7 @@ entry:
   %yy = zext <2 x i32> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1639,7 +1639,7 @@ entry:
   %yy = sext <2 x i32> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1656,7 +1656,7 @@ entry:
   %yy = zext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
   %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1673,7 +1673,7 @@ entry:
   %yy = sext <8 x i16> %y to <8 x i32>
   %m = mul <8 x i32> %xx, %yy
   %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1693,7 +1693,7 @@ entry:
   %yy = zext <4 x i16> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1713,7 +1713,7 @@ entry:
   %yy = sext <4 x i16> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1729,7 +1729,7 @@ entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
   %m = mul <8 x i16> %x, %y
   %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -1746,7 +1746,7 @@ entry:
   %yy = zext <8 x i16> %y to <8 x i64>
   %m = mul <8 x i64> %xx, %yy
   %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1763,7 +1763,7 @@ entry:
   %yy = sext <8 x i16> %y to <8 x i64>
   %m = mul <8 x i64> %xx, %yy
   %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1781,7 +1781,7 @@ entry:
   %m = mul <8 x i32> %xx, %yy
   %ma = zext <8 x i32> %m to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1799,7 +1799,7 @@ entry:
   %m = mul <8 x i32> %xx, %yy
   %ma = sext <8 x i32> %m to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1816,7 +1816,7 @@ entry:
   %m = mul <8 x i32> %xx, %xx
   %ma = zext <8 x i32> %m to <8 x i64>
   %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1873,7 +1873,7 @@ entry:
   %yy = zext <2 x i16> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1929,7 +1929,7 @@ entry:
   %yy = sext <2 x i16> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -1946,7 +1946,7 @@ entry:
   %yy = zext <16 x i8> %y to <16 x i32>
   %m = mul <16 x i32> %xx, %yy
   %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1963,7 +1963,7 @@ entry:
   %yy = sext <16 x i8> %y to <16 x i32>
   %m = mul <16 x i32> %xx, %yy
   %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1981,7 +1981,7 @@ entry:
   %m = mul <16 x i16> %xx, %yy
   %ma = zext <16 x i16> %m to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -1999,7 +1999,7 @@ entry:
   %m = mul <16 x i16> %xx, %yy
   %ma = sext <16 x i16> %m to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -2016,7 +2016,7 @@ entry:
   %m = mul <16 x i16> %xx, %xx
   %ma = zext <16 x i16> %m to <16 x i32>
   %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -2037,7 +2037,7 @@ entry:
   %yy = zext <4 x i8> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -2060,7 +2060,7 @@ entry:
   %yy = sext <4 x i8> %y to <4 x i32>
   %m = mul <4 x i32> %xx, %yy
   %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
-  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
   %r = add i32 %z, %a
   ret i32 %r
 }
@@ -2078,7 +2078,7 @@ entry:
   %yy = zext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
   %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -2096,7 +2096,7 @@ entry:
   %yy = sext <16 x i8> %y to <16 x i16>
   %m = mul <16 x i16> %xx, %yy
   %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -2117,7 +2117,7 @@ entry:
   %yy = zext <8 x i8> %y to <8 x i16>
   %m = mul <8 x i16> %xx, %yy
   %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -2138,7 +2138,7 @@ entry:
   %yy = sext <8 x i8> %y to <8 x i16>
   %m = mul <8 x i16> %xx, %yy
   %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
-  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
   %r = add i16 %z, %a
   ret i16 %r
 }
@@ -2154,7 +2154,7 @@ entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
   %m = mul <16 x i8> %x, %y
   %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
-  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s)
+  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
   %r = add i8 %z, %a
   ret i8 %r
 }
@@ -2569,7 +2569,7 @@ entry:
   %yy = zext <16 x i8> %y to <16 x i64>
   %m = mul <16 x i64> %xx, %yy
   %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -2917,7 +2917,7 @@ entry:
   %yy = sext <16 x i8> %y to <16 x i64>
   %m = mul <16 x i64> %xx, %yy
   %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -2974,7 +2974,7 @@ entry:
   %yy = zext <2 x i8> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -3030,7 +3030,7 @@ entry:
   %yy = sext <2 x i8> %y to <2 x i64>
   %m = mul <2 x i64> %xx, %yy
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
@@ -3088,18 +3088,18 @@ entry:
   %c = icmp eq <2 x i64> %b, zeroinitializer
   %m = mul <2 x i64> %x, %y
   %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
-  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
   %r = add i64 %z, %a
   ret i64 %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
index 7510169fbbd5..4ff682126ada 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
@@ -9,7 +9,7 @@ define arm_aapcs_vfpcc i32 @mul_v2i32(<2 x i32> %x) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %x)
   ret i32 %z
 }
 
@@ -25,7 +25,7 @@ define arm_aapcs_vfpcc i32 @mul_v4i32(<4 x i32> %x) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %x)
   ret i32 %z
 }
 
@@ -42,7 +42,7 @@ define arm_aapcs_vfpcc i32 @mul_v8i32(<8 x i32> %x) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %x)
   ret i32 %z
 }
 
@@ -58,7 +58,7 @@ define arm_aapcs_vfpcc i16 @mul_v4i16(<4 x i16> %x) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %x)
   ret i16 %z
 }
 
@@ -76,7 +76,7 @@ define arm_aapcs_vfpcc i16 @mul_v8i16(<8 x i16> %x) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %x)
   ret i16 %z
 }
 
@@ -95,7 +95,7 @@ define arm_aapcs_vfpcc i16 @mul_v16i16(<16 x i16> %x) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %x)
   ret i16 %z
 }
 
@@ -113,7 +113,7 @@ define arm_aapcs_vfpcc i8 @mul_v8i8(<8 x i8> %x) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %x)
   ret i8 %z
 }
 
@@ -133,7 +133,7 @@ define arm_aapcs_vfpcc i8 @mul_v16i8(<16 x i8> %x) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %x)
   ret i8 %z
 }
 
@@ -154,7 +154,7 @@ define arm_aapcs_vfpcc i8 @mul_v32i8(<32 x i8> %x) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %x)
   ret i8 %z
 }
 
@@ -163,7 +163,7 @@ define arm_aapcs_vfpcc i64 @mul_v1i64(<1 x i64> %x) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %x)
   ret i64 %z
 }
 
@@ -179,7 +179,7 @@ define arm_aapcs_vfpcc i64 @mul_v2i64(<2 x i64> %x) {
 ; CHECK-NEXT:    mla r1, r3, r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x)
   ret i64 %z
 }
 
@@ -207,7 +207,7 @@ define arm_aapcs_vfpcc i64 @mul_v4i64(<4 x i64> %x) {
 ; CHECK-NEXT:    mla r1, r1, r6, r4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x)
   ret i64 %z
 }
 
@@ -220,7 +220,7 @@ define arm_aapcs_vfpcc i32 @mul_v2i32_acc(<2 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %x)
   %r = mul i32 %y, %z
   ret i32 %r
 }
@@ -238,7 +238,7 @@ define arm_aapcs_vfpcc i32 @mul_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %x)
   %r = mul i32 %y, %z
   ret i32 %r
 }
@@ -257,7 +257,7 @@ define arm_aapcs_vfpcc i32 @mul_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %x)
+  %z = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %x)
   %r = mul i32 %y, %z
   ret i32 %r
 }
@@ -275,7 +275,7 @@ define arm_aapcs_vfpcc i16 @mul_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %x)
   %r = mul i16 %y, %z
   ret i16 %r
 }
@@ -295,7 +295,7 @@ define arm_aapcs_vfpcc i16 @mul_v8i16_acc(<8 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %x)
   %r = mul i16 %y, %z
   ret i16 %r
 }
@@ -316,7 +316,7 @@ define arm_aapcs_vfpcc i16 @mul_v16i16_acc(<16 x i16> %x, i16 %y) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %x)
+  %z = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %x)
   %r = mul i16 %y, %z
   ret i16 %r
 }
@@ -336,7 +336,7 @@ define arm_aapcs_vfpcc i8 @mul_v8i8_acc(<8 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %x)
   %r = mul i8 %y, %z
   ret i8 %r
 }
@@ -358,7 +358,7 @@ define arm_aapcs_vfpcc i8 @mul_v16i8_acc(<16 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %x)
   %r = mul i8 %y, %z
   ret i8 %r
 }
@@ -381,7 +381,7 @@ define arm_aapcs_vfpcc i8 @mul_v32i8_acc(<32 x i8> %x, i8 %y) {
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
-  %z = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %x)
+  %z = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %x)
   %r = mul i8 %y, %z
   ret i8 %r
 }
@@ -397,7 +397,7 @@ define arm_aapcs_vfpcc i64 @mul_v1i64_acc(<1 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %x)
   %r = mul i64 %y, %z
   ret i64 %r
 }
@@ -420,7 +420,7 @@ define arm_aapcs_vfpcc i64 @mul_v2i64_acc(<2 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x)
   %r = mul i64 %y, %z
   ret i64 %r
 }
@@ -453,20 +453,20 @@ define arm_aapcs_vfpcc i64 @mul_v4i64_acc(<4 x i64> %x, i64 %y) {
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
-  %z = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %x)
+  %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x)
   %r = mul i64 %y, %z
   ret i64 %r
 }
 
-declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>)
-declare i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>)
-declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>)
+declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
+declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)

diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
index 80c8ae65e4b7..9502716bc98c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
@@ -1,18 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
 
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
 
 define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8(<16 x i8> %s1) {
 ; CHECK-LABEL: vmaxv_s_v16i8:
@@ -20,7 +20,7 @@ define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8(<16 x i8> %s1) {
 ; CHECK-NEXT:    mvn r0, #127
 ; CHECK-NEXT:    vmaxv.s8 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %s1)
   ret i8 %r
 }
 
@@ -31,7 +31,7 @@ define arm_aapcs_vfpcc i16 @vmaxv_s_v8i16(<8 x i16> %s1) {
 ; CHECK-NEXT:    movt r0, #65535
 ; CHECK-NEXT:    vmaxv.s16 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %s1)
   ret i16 %r
 }
 
@@ -41,7 +41,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_s_v4i32(<4 x i32> %s1) {
 ; CHECK-NEXT:    mov.w r0, #-2147483648
 ; CHECK-NEXT:    vmaxv.s32 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %s1)
   ret i32 %r
 }
 
@@ -51,7 +51,7 @@ define arm_aapcs_vfpcc i8 @vmaxv_u_v16i8(<16 x i8> %s1) {
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    vmaxv.u8 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %s1)
   ret i8 %r
 }
 
@@ -61,7 +61,7 @@ define arm_aapcs_vfpcc i16 @vmaxv_u_v8i16(<8 x i16> %s1) {
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    vmaxv.u16 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %s1)
   ret i16 %r
 }
 
@@ -71,7 +71,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_u_v4i32(<4 x i32> %s1) {
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    vmaxv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %s1)
   ret i32 %r
 }
 
@@ -81,7 +81,7 @@ define arm_aapcs_vfpcc i8 @vminv_s_v16i8(<16 x i8> %s1) {
 ; CHECK-NEXT:    movs r0, #127
 ; CHECK-NEXT:    vminv.s8 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %s1)
   ret i8 %r
 }
 
@@ -91,7 +91,7 @@ define arm_aapcs_vfpcc i16 @vminv_s_v8i16(<8 x i16> %s1) {
 ; CHECK-NEXT:    movw r0, #32767
 ; CHECK-NEXT:    vminv.s16 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %s1)
   ret i16 %r
 }
 
@@ -101,7 +101,7 @@ define arm_aapcs_vfpcc i32 @vminv_s_v4i32(<4 x i32> %s1) {
 ; CHECK-NEXT:    mvn r0, #-2147483648
 ; CHECK-NEXT:    vminv.s32 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %s1)
   ret i32 %r
 }
 
@@ -111,7 +111,7 @@ define arm_aapcs_vfpcc i8 @vminv_u_v16i8(<16 x i8> %s1) {
 ; CHECK-NEXT:    movs r0, #255
 ; CHECK-NEXT:    vminv.u8 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %s1)
   ret i8 %r
 }
 
@@ -121,7 +121,7 @@ define arm_aapcs_vfpcc i16 @vminv_u_v8i16(<8 x i16> %s1) {
 ; CHECK-NEXT:    movw r0, #65535
 ; CHECK-NEXT:    vminv.u16 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %s1)
   ret i16 %r
 }
 
@@ -131,7 +131,7 @@ define arm_aapcs_vfpcc i32 @vminv_u_v4i32(<4 x i32> %s1) {
 ; CHECK-NEXT:    mov.w r0, #-1
 ; CHECK-NEXT:    vminv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %s1)
   ret i32 %r
 }
 
@@ -142,7 +142,7 @@ define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8_i8(<16 x i8> %s1, i8 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmaxv.s8 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %s1)
   %c = icmp sgt i8 %r, %s2
   %s = select i1 %c, i8 %r, i8 %s2
   ret i8 %s
@@ -157,7 +157,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_s_v16i8_i32(<16 x i8> %s1, i32 %s2) {
 ; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    csel r0, r1, r0, gt
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %s1)
   %rs = sext i8 %r to i32
   %c = icmp sgt i32 %rs, %s2
   %s = select i1 %c, i32 %rs, i32 %s2
@@ -169,7 +169,7 @@ define arm_aapcs_vfpcc i16 @vmaxv_s_v8i16_i16(<8 x i16> %s1, i16 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmaxv.s16 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %s1)
   %c = icmp sgt i16 %r, %s2
   %s = select i1 %c, i16 %r, i16 %s2
   ret i16 %s
@@ -185,7 +185,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_s_v8i16_i32(<8 x i16> %s1, i32 %s2) {
 ; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    csel r0, r1, r0, gt
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %s1)
   %rs = sext i16 %r to i32
   %c = icmp sgt i32 %rs, %s2
   %s = select i1 %c, i32 %rs, i32 %s2
@@ -197,7 +197,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_s_v4i32_i32(<4 x i32> %s1, i32 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmaxv.s32 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %s1)
   %c = icmp sgt i32 %r, %s2
   %s = select i1 %c, i32 %r, i32 %s2
   ret i32 %s
@@ -208,7 +208,7 @@ define arm_aapcs_vfpcc i8 @vmaxv_u_v16i8_i8(<16 x i8> %s1, i8 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmaxv.u8 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %s1)
   %c = icmp ugt i8 %r, %s2
   %s = select i1 %c, i8 %r, i8 %s2
   ret i8 %s
@@ -223,7 +223,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_u_v16i8_i32(<16 x i8> %s1, i32 %s2) {
 ; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    csel r0, r1, r0, hi
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %s1)
   %rs = zext i8 %r to i32
   %c = icmp ugt i32 %rs, %s2
   %s = select i1 %c, i32 %rs, i32 %s2
@@ -235,7 +235,7 @@ define arm_aapcs_vfpcc i16 @vmaxv_u_v8i16_i16(<8 x i16> %s1, i16 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmaxv.u16 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %s1)
   %c = icmp ugt i16 %r, %s2
   %s = select i1 %c, i16 %r, i16 %s2
   ret i16 %s
@@ -250,7 +250,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_u_v8i16_i32(<8 x i16> %s1, i32 %s2) {
 ; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    csel r0, r1, r0, hi
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %s1)
   %rs = zext i16 %r to i32
   %c = icmp ugt i32 %rs, %s2
   %s = select i1 %c, i32 %rs, i32 %s2
@@ -262,7 +262,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_u_v4i32_i32(<4 x i32> %s1, i32 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmaxv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %s1)
   %c = icmp ugt i32 %r, %s2
   %s = select i1 %c, i32 %r, i32 %s2
   ret i32 %s
@@ -273,7 +273,7 @@ define arm_aapcs_vfpcc i8 @vminv_s_v16i8_i8(<16 x i8> %s1, i8 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vminv.s8 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %s1)
   %c = icmp slt i8 %r, %s2
   %s = select i1 %c, i8 %r, i8 %s2
   ret i8 %s
@@ -288,7 +288,7 @@ define arm_aapcs_vfpcc i32 @vminv_s_v16i8_i32(<16 x i8> %s1, i32 %s2) {
 ; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    csel r0, r1, r0, lt
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %s1)
   %rs = sext i8 %r to i32
   %c = icmp slt i32 %rs, %s2
   %s = select i1 %c, i32 %rs, i32 %s2
@@ -300,7 +300,7 @@ define arm_aapcs_vfpcc i16 @vminv_s_v8i16_i16(<8 x i16> %s1, i16 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vminv.s16 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %s1)
   %c = icmp slt i16 %r, %s2
   %s = select i1 %c, i16 %r, i16 %s2
   ret i16 %s
@@ -315,7 +315,7 @@ define arm_aapcs_vfpcc i32 @vminv_s_v8i16_i32(<8 x i16> %s1, i32 %s2) {
 ; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    csel r0, r1, r0, lt
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %s1)
   %rs = sext i16 %r to i32
   %c = icmp slt i32 %rs, %s2
   %s = select i1 %c, i32 %rs, i32 %s2
@@ -327,7 +327,7 @@ define arm_aapcs_vfpcc i32 @vminv_s_v4i32_i32(<4 x i32> %s1, i32 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vminv.s32 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %s1)
   %c = icmp slt i32 %r, %s2
   %s = select i1 %c, i32 %r, i32 %s2
   ret i32 %s
@@ -338,7 +338,7 @@ define arm_aapcs_vfpcc i8 @vminv_u_v16i8_i8(<16 x i8> %s1, i8 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vminv.u8 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %s1)
   %c = icmp ult i8 %r, %s2
   %s = select i1 %c, i8 %r, i8 %s2
   ret i8 %s
@@ -353,7 +353,7 @@ define arm_aapcs_vfpcc i32 @vminv_u_v16i8_i32(<16 x i8> %s1, i32 %s2) {
 ; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    csel r0, r1, r0, lo
 ; CHECK-NEXT:    bx lr
-  %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1)
+  %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %s1)
   %rs = zext i8 %r to i32
   %c = icmp ult i32 %rs, %s2
   %s = select i1 %c, i32 %rs, i32 %s2
@@ -365,7 +365,7 @@ define arm_aapcs_vfpcc i16 @vminv_u_v8i16_i16(<8 x i16> %s1, i16 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vminv.u16 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %s1)
   %c = icmp ult i16 %r, %s2
   %s = select i1 %c, i16 %r, i16 %s2
   ret i16 %s
@@ -380,7 +380,7 @@ define arm_aapcs_vfpcc i32 @vminv_u_v8i16_i32(<8 x i16> %s1, i32 %s2) {
 ; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    csel r0, r1, r0, lo
 ; CHECK-NEXT:    bx lr
-  %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1)
+  %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %s1)
   %rs = zext i16 %r to i32
   %c = icmp ult i32 %rs, %s2
   %s = select i1 %c, i32 %rs, i32 %s2
@@ -392,7 +392,7 @@ define arm_aapcs_vfpcc i32 @vminv_u_v4i32_i32(<4 x i32> %s1, i32 %s2) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vminv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
-  %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %s1)
+  %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %s1)
   %c = icmp ult i32 %r, %s2
   %s = select i1 %c, i32 %r, i32 %s2
   ret i32 %s

diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll
index 2ea26b45e871..496d2367445b 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/CodeGen/X86/haddsub.ll
@@ -1628,8 +1628,8 @@ define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1,
 ; Repeat tests from general reductions to verify output for hoppy targets:
 ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971
 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
 
 define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
 ; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
@@ -1672,7 +1672,7 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
 ; AVX-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
-  %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
+  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %r
 }
 
@@ -1711,7 +1711,7 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
 ; AVX-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
-  %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
+  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %r
 }
 

diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll
index 36b2de07bcc8..36bbdd4265b1 100644
--- a/llvm/test/CodeGen/X86/pr45378.ll
+++ b/llvm/test/CodeGen/X86/pr45378.ll
@@ -6,7 +6,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f  | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
 
-declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
 
 define i1 @parseHeaders(i64 * %ptr) nounwind {
 ; SSE2-LABEL: parseHeaders:
@@ -34,7 +34,7 @@ define i1 @parseHeaders(i64 * %ptr) nounwind {
 ; AVX-NEXT:    retq
   %vptr = bitcast i64 * %ptr to <2 x i64> *
   %vload = load <2 x i64>, <2 x i64> * %vptr, align 8
-  %vreduce = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %vload)
+  %vreduce = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %vload)
   %vcheck = icmp eq i64 %vreduce, 0
   ret i1 %vcheck
 }

diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll
index 92f97d0469cd..3ddaf62fe136 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -32,7 +32,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a0)
   ret i64 %1
 }
 
@@ -74,7 +74,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a0)
   ret i64 %1
 }
 
@@ -124,7 +124,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a0)
   ret i64 %1
 }
 
@@ -187,7 +187,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a0)
   ret i64 %1
 }
 
@@ -229,7 +229,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
 
@@ -276,7 +276,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a0)
   ret i32 %1
 }
 
@@ -336,7 +336,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a0)
   ret i32 %1
 }
 
@@ -408,7 +408,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a0)
   ret i32 %1
 }
 
@@ -499,7 +499,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %a0)
   ret i32 %1
 }
 
@@ -547,7 +547,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
 
@@ -601,7 +601,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a0)
   ret i16 %1
 }
 
@@ -663,7 +663,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a0)
   ret i16 %1
 }
 
@@ -738,7 +738,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a0)
   ret i16 %1
 }
 
@@ -826,7 +826,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a0)
   ret i16 %1
 }
 
@@ -933,7 +933,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %a0)
   ret i16 %1
 }
 
@@ -966,7 +966,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -1002,7 +1002,7 @@ define i8 @test_v2i8_load(<2 x i8>* %p) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
   %a0 = load <2 x i8>, <2 x i8>* %p
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -1043,7 +1043,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -1075,7 +1075,7 @@ define i8 @test_v4i8_load(<4 x i8>* %p) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
   %a0 = load <4 x i8>, <4 x i8>* %p
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -1103,7 +1103,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -1135,7 +1135,7 @@ define i8 @test_v8i8_load(<8 x i8>* %p) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
   %a0 = load <8 x i8>, <8 x i8>* %p
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -1169,7 +1169,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a0)
   ret i8 %1
 }
 
@@ -1223,7 +1223,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a0)
   ret i8 %1
 }
 
@@ -1285,7 +1285,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a0)
   ret i8 %1
 }
 
@@ -1360,32 +1360,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %a0)
   ret i8 %1
 }
 
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 3c5e46ec9643..e6b895692cc0 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -59,7 +59,7 @@ define i1 @trunc_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL-NEXT:    sete %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <2 x i64> %0 to <2 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a)
   ret i1 %b
 }
 
@@ -111,7 +111,7 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL-NEXT:    sete %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <4 x i32> %0 to <4 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -176,7 +176,7 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
 ; AVX512VL-NEXT:    sete %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <8 x i8> %0 to <8 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -205,7 +205,7 @@ define i1 @trunc_v16i8_v16i1(<16 x i8>) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    retq
   %a = trunc <16 x i8> %0 to <16 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -262,7 +262,7 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <4 x i64> %0 to <4 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -351,7 +351,7 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <8 x i32> %0 to <8 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -420,7 +420,7 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <16 x i16> %0 to <16 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -492,7 +492,7 @@ define i1 @trunc_v32i8_v32i1(<32 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <32 x i8> %0 to <32 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -597,7 +597,7 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <8 x i64> %0 to <8 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -678,7 +678,7 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) {
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %a = trunc <16 x i32> %0 to <16 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -765,7 +765,7 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <32 x i16> %0 to <32 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -845,7 +845,7 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <64 x i8> %0 to <64 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a)
   ret i1 %b
 }
 
@@ -905,7 +905,7 @@ define i1 @icmp_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL-NEXT:    sete %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <2 x i64> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a)
   ret i1 %b
 }
 
@@ -961,7 +961,7 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL-NEXT:    sete %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <4 x i32> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -1014,7 +1014,7 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
 ; AVX512VL-NEXT:    sete %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <8 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -1062,7 +1062,7 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>) {
 ; AVX512VL-NEXT:    setb %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <16 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -1141,7 +1141,7 @@ define i1 @icmp_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <4 x i64> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -1207,7 +1207,7 @@ define i1 @icmp_v8i32_v8i1(<8 x i32>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <8 x i32> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -1274,7 +1274,7 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <16 x i16> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -1354,7 +1354,7 @@ define i1 @icmp_v32i8_v32i1(<32 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <32 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -1447,7 +1447,7 @@ define i1 @icmp_v8i64_v8i1(<8 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <8 x i64> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -1507,7 +1507,7 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>) {
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %a = icmp eq <16 x i32> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -1595,7 +1595,7 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <32 x i16> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -1686,13 +1686,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <64 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a)
   ret i1 %b
 }
 
-declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>)
-declare i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1>)
+declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>)
+declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
index 0df3238e6a2d..64066be4cd2a 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
@@ -28,7 +28,7 @@ define i1 @test_v2i64(<2 x i64> %a0) {
 ; AVX-NEXT:    testq %rax, %rax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a0)
   %2 = icmp eq i64 %1, 0
   ret i1 %2
 }
@@ -79,7 +79,7 @@ define i1 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a0)
   %2 = icmp ne i64 %1, 0
   ret i1 %2
 }
@@ -136,7 +136,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %a0)
   %2 = icmp eq i64 %1, 0
   ret i1 %2
 }
@@ -202,7 +202,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %a0)
   %2 = icmp ne i64 %1, 0
   ret i1 %2
 }
@@ -229,7 +229,7 @@ define i1 @test_v2i32(<2 x i32> %a0) {
 ; AVX-NEXT:    testl %eax, %eax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a0)
   %2 = icmp eq i32 %1, 0
   ret i1 %2
 }
@@ -256,7 +256,7 @@ define i1 @test_v4i32(<4 x i32> %a0) {
 ; AVX-NEXT:    testl %eax, %eax
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a0)
   %2 = icmp ne i32 %1, 0
   ret i1 %2
 }
@@ -315,7 +315,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a0)
   %2 = icmp eq i32 %1, 0
   ret i1 %2
 }
@@ -380,7 +380,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a0)
   %2 = icmp ne i32 %1, 0
   ret i1 %2
 }
@@ -454,7 +454,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %a0)
   %2 = icmp eq i32 %1, 0
   ret i1 %2
 }
@@ -482,7 +482,7 @@ define i1 @test_v2i16(<2 x i16> %a0) {
 ; AVX-NEXT:    testw %ax, %ax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a0)
   %2 = icmp eq i16 %1, 0
   ret i1 %2
 }
@@ -510,7 +510,7 @@ define i1 @test_v4i16(<4 x i16> %a0) {
 ; AVX-NEXT:    testw %ax, %ax
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a0)
   %2 = icmp ne i16 %1, 0
   ret i1 %2
 }
@@ -542,7 +542,7 @@ define i1 @test_v8i16(<8 x i16> %a0) {
 ; AVX-NEXT:    testw %ax, %ax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a0)
   %2 = icmp eq i16 %1, 0
   ret i1 %2
 }
@@ -610,7 +610,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %a0)
   %2 = icmp ne i16 %1, 0
   ret i1 %2
 }
@@ -684,7 +684,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %a0)
   %2 = icmp eq i16 %1, 0
   ret i1 %2
 }
@@ -767,7 +767,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %a0)
   %2 = icmp ne i16 %1, 0
   ret i1 %2
 }
@@ -795,7 +795,7 @@ define i1 @test_v2i8(<2 x i8> %a0) {
 ; AVX-NEXT:    testb %al, %al
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %a0)
   %2 = icmp eq i8 %1, 0
   ret i1 %2
 }
@@ -824,7 +824,7 @@ define i1 @test_v4i8(<4 x i8> %a0) {
 ; AVX-NEXT:    testb %al, %al
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a0)
   %2 = icmp ne i8 %1, 0
   ret i1 %2
 }
@@ -857,7 +857,7 @@ define i1 @test_v8i8(<8 x i8> %a0) {
 ; AVX-NEXT:    testb %al, %al
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a0)
   %2 = icmp eq i8 %1, 0
   ret i1 %2
 }
@@ -894,7 +894,7 @@ define i1 @test_v16i8(<16 x i8> %a0) {
 ; AVX-NEXT:    testb %al, %al
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a0)
   %2 = icmp ne i8 %1, 0
   ret i1 %2
 }
@@ -971,7 +971,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a0)
   %2 = icmp eq i8 %1, 0
   ret i1 %2
 }
@@ -1054,7 +1054,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %a0)
   %2 = icmp ne i8 %1, 0
   ret i1 %2
 }
@@ -1146,33 +1146,33 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %a0)
   %2 = icmp eq i8 %1, 0
   ret i1 %2
 }
 
-declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll
index 9545d29cee58..8f7b02fbd100 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll
@@ -24,7 +24,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a0)
   ret i64 %1
 }
 
@@ -66,7 +66,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a0)
   ret i64 %1
 }
 
@@ -114,7 +114,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %a0)
   ret i64 %1
 }
 
@@ -171,7 +171,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %a0)
   ret i64 %1
 }
 
@@ -193,7 +193,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
 
@@ -215,7 +215,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a0)
   ret i32 %1
 }
 
@@ -265,7 +265,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a0)
   ret i32 %1
 }
 
@@ -321,7 +321,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a0)
   ret i32 %1
 }
 
@@ -386,7 +386,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %a0)
   ret i32 %1
 }
 
@@ -411,7 +411,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
 
@@ -436,7 +436,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a0)
   ret i16 %1
 }
 
@@ -465,7 +465,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a0)
   ret i16 %1
 }
 
@@ -528,7 +528,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %a0)
   ret i16 %1
 }
 
@@ -597,7 +597,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %a0)
   ret i16 %1
 }
 
@@ -675,7 +675,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %a0)
   ret i16 %1
 }
 
@@ -700,7 +700,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -726,7 +726,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -756,7 +756,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -790,7 +790,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a0)
   ret i8 %1
 }
 
@@ -862,7 +862,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a0)
   ret i8 %1
 }
 
@@ -940,7 +940,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %a0)
   ret i8 %1
 }
 
@@ -1027,32 +1027,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %a0)
   ret i8 %1
 }
 
-declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
index e98833e3b4c2..ab913980cbe8 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -53,7 +53,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
 }
 
@@ -112,7 +112,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
 }
 
@@ -185,7 +185,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %1
 }
 
@@ -268,7 +268,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
   ret float %1
 }
 
@@ -313,7 +313,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
   ret float %1
 }
 
@@ -367,7 +367,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
   ret float %1
 }
 
@@ -435,7 +435,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
   ret float %1
 }
 
@@ -513,7 +513,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
   ret float %1
 }
 
@@ -558,7 +558,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
   ret float %1
 }
 
@@ -612,7 +612,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
   ret float %1
 }
 
@@ -680,7 +680,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
   ret float %1
 }
 
@@ -758,7 +758,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
   ret float %1
 }
 
@@ -801,7 +801,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
 
@@ -853,7 +853,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %1
 }
 
@@ -912,7 +912,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
   ret double %1
 }
 
@@ -982,7 +982,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
   ret double %1
 }
 
@@ -1021,7 +1021,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
   ret double %1
 }
 
@@ -1069,7 +1069,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
   ret double %1
 }
 
@@ -1124,7 +1124,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
   ret double %1
 }
 
@@ -1189,7 +1189,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
   ret double %1
 }
 
@@ -1228,7 +1228,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
   ret double %1
 }
 
@@ -1276,7 +1276,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
   ret double %1
 }
 
@@ -1331,7 +1331,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
   ret double %1
 }
 
@@ -1396,16 +1396,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
 
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
index 7de6a25029e7..ebcf1d38e23b 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
@@ -39,7 +39,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
 }
 
@@ -90,7 +90,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
 }
 
@@ -176,7 +176,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %1
 }
 
@@ -327,7 +327,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
   ret float %1
 }
 
@@ -367,7 +367,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
   ret float %1
 }
 
@@ -422,7 +422,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
   ret float %1
 }
 
@@ -512,7 +512,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
   ret float %1
 }
 
@@ -667,7 +667,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
   ret float %1
 }
 
@@ -699,7 +699,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float undef, <2 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
   ret float %1
 }
 
@@ -746,7 +746,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
   ret float %1
 }
 
@@ -828,7 +828,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float undef, <8 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
   ret float %1
 }
 
@@ -975,7 +975,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float undef, <16 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
   ret float %1
 }
 
@@ -1004,7 +1004,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
 
@@ -1042,7 +1042,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %1
 }
 
@@ -1101,7 +1101,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
   ret double %1
 }
 
@@ -1229,7 +1229,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
   ret double %1
 }
 
@@ -1261,7 +1261,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
   ret double %1
 }
 
@@ -1302,7 +1302,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
   ret double %1
 }
 
@@ -1364,7 +1364,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
   ret double %1
 }
 
@@ -1467,7 +1467,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
   ret double %1
 }
 
@@ -1493,7 +1493,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double undef, <2 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
   ret double %1
 }
 
@@ -1528,7 +1528,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double undef, <4 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
   ret double %1
 }
 
@@ -1584,7 +1584,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double undef, <8 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
   ret double %1
 }
 
@@ -1681,16 +1681,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double undef, <16 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
 
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
index 50b88c2c55f5..01b41c7ed7a7 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
@@ -39,7 +39,7 @@ define float @test_v2f32(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0)
   ret float %1
 }
 
@@ -78,7 +78,7 @@ define float @test_v4f32(<4 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0)
   ret float %1
 }
 
@@ -125,7 +125,7 @@ define float @test_v8f32(<8 x float> %a0) {
 ; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0)
   ret float %1
 }
 
@@ -179,7 +179,7 @@ define float @test_v16f32(<16 x float> %a0) {
 ; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0)
   ret float %1
 }
 
@@ -206,7 +206,7 @@ define double @test_v2f64(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vminsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0)
   ret double %1
 }
 
@@ -236,7 +236,7 @@ define double @test_v4f64(<4 x double> %a0) {
 ; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0)
   ret double %1
 }
 
@@ -271,7 +271,7 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512-NEXT:    vminsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0)
   ret double %1
 }
 
@@ -313,16 +313,16 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>)
+declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
 
-declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index f4539c572375..091990a90fbb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -35,7 +35,7 @@ define float @test_v2f32(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a0)
   ret float %1
 }
 
@@ -84,7 +84,7 @@ define float @test_v4f32(<4 x float> %a0) {
 ; AVX512-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0)
   ret float %1
 }
 
@@ -155,7 +155,7 @@ define float @test_v8f32(<8 x float> %a0) {
 ; AVX512-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0)
   ret float %1
 }
 
@@ -247,7 +247,7 @@ define float @test_v16f32(<16 x float> %a0) {
 ; AVX512-NEXT:    vmaxss %xmm8, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0)
   ret float %1
 }
 
@@ -274,7 +274,7 @@ define double @test_v2f64(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a0)
+  %1 = call nnan double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a0)
   ret double %1
 }
 
@@ -316,7 +316,7 @@ define double @test_v3f64(<3 x double> %a0) {
 ; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double> %a0)
+  %1 = call nnan double @llvm.vector.reduce.fmax.v3f64(<3 x double> %a0)
   ret double %1
 }
 
@@ -350,7 +350,7 @@ define double @test_v4f64(<4 x double> %a0) {
 ; AVX512-NEXT:    vmaxsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0)
+  %1 = call nnan double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0)
   ret double %1
 }
 
@@ -395,7 +395,7 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512-NEXT:    vmaxsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0)
+  %1 = call nnan double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0)
   ret double %1
 }
 
@@ -447,7 +447,7 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0)
+  %1 = call nnan double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0)
   ret double %1
 }
 
@@ -511,18 +511,18 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call nnan half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %a0)
+  %1 = call nnan half @llvm.vector.reduce.fmax.v2f16(<2 x half> %a0)
   ret half %1
 }
-declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>)
+declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
 
-declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v3f64(<3 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
 
-declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>)
+declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
index c5e025be5423..af21b7338f99 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
@@ -14,7 +14,7 @@ define float @test_v1f32(<1 x float> %a0) {
 ; ALL-LABEL: test_v1f32:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a0)
   ret float %1
 }
 
@@ -62,7 +62,7 @@ define float @test_v2f32(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a0)
   ret float %1
 }
 
@@ -133,7 +133,7 @@ define float @test_v3f32(<3 x float> %a0) {
 ; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a0)
   ret float %1
 }
 
@@ -230,7 +230,7 @@ define float @test_v4f32(<4 x float> %a0) {
 ; AVX512-NEXT:    vmaxss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0)
   ret float %1
 }
 
@@ -401,7 +401,7 @@ define float @test_v8f32(<8 x float> %a0) {
 ; AVX512VL-NEXT:    vmovss %xmm8, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0)
   ret float %1
 }
 
@@ -661,7 +661,7 @@ define float @test_v16f32(<16 x float> %a0) {
 ; AVX512VL-NEXT:    vmovss %xmm8, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0)
   ret float %1
 }
 
@@ -700,7 +700,7 @@ define double @test_v2f64(<2 x double> %a0) {
 ; AVX512-NEXT:    vmovsd %xmm2, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a0)
   ret double %1
 }
 
@@ -774,7 +774,7 @@ define double @test_v4f64(<4 x double> %a0) {
 ; AVX512-NEXT:    vmovsd %xmm2, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0)
   ret double %1
 }
 
@@ -922,7 +922,7 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512VL-NEXT:    vmovsd %xmm8, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0)
   ret double %1
 }
 
@@ -1091,18 +1091,18 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512-NEXT:    vmovsd %xmm2, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>)
+declare float @llvm.vector.reduce.fmax.v1f32(<1 x float>)
+declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmax.v3f32(<3 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
 
-declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>)
-declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
index 5846f588581d..de7c67c5ca59 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
@@ -14,7 +14,7 @@ define float @test_v1f32(<1 x float> %a0) {
 ; ALL-LABEL: test_v1f32:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a0)
   ret float %1
 }
 
@@ -43,7 +43,7 @@ define float @test_v2f32(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0)
   ret float %1
 }
 
@@ -84,7 +84,7 @@ define float @test_v3f32(<3 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT:    vminss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a0)
   ret float %1
 }
 
@@ -133,7 +133,7 @@ define float @test_v4f32(<4 x float> %a0) {
 ; AVX512-NEXT:    vminss %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a0)
   ret float %1
 }
 
@@ -204,7 +204,7 @@ define float @test_v8f32(<8 x float> %a0) {
 ; AVX512-NEXT:    vminss %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0)
   ret float %1
 }
 
@@ -296,7 +296,7 @@ define float @test_v16f32(<16 x float> %a0) {
 ; AVX512-NEXT:    vminss %xmm8, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0)
+  %1 = call nnan float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a0)
   ret float %1
 }
 
@@ -323,7 +323,7 @@ define double @test_v2f64(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vminsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0)
+  %1 = call nnan double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0)
   ret double %1
 }
 
@@ -357,7 +357,7 @@ define double @test_v4f64(<4 x double> %a0) {
 ; AVX512-NEXT:    vminsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0)
+  %1 = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a0)
   ret double %1
 }
 
@@ -402,7 +402,7 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512-NEXT:    vminsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0)
+  %1 = call nnan double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0)
   ret double %1
 }
 
@@ -454,7 +454,7 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512-NEXT:    vminsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0)
+  %1 = call nnan double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a0)
   ret double %1
 }
 
@@ -518,20 +518,20 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call nnan half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %a0)
+  %1 = call nnan half @llvm.vector.reduce.fmin.v2f16(<2 x half> %a0)
   ret half %1
 }
 
-declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>)
+declare float @llvm.vector.reduce.fmin.v1f32(<1 x float>)
+declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmin.v3f32(<3 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
 
-declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
 
-declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>)
+declare half @llvm.vector.reduce.fmin.v2f16(<2 x half>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
index 1d7436eaa8a4..10e58428d375 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -54,7 +54,7 @@ define float @test_v2f32(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0)
   ret float %1
 }
 
@@ -151,7 +151,7 @@ define float @test_v4f32(<4 x float> %a0) {
 ; AVX512-NEXT:    vminss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a0)
   ret float %1
 }
 
@@ -322,7 +322,7 @@ define float @test_v8f32(<8 x float> %a0) {
 ; AVX512VL-NEXT:    vmovss %xmm8, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0)
   ret float %1
 }
 
@@ -582,7 +582,7 @@ define float @test_v16f32(<16 x float> %a0) {
 ; AVX512VL-NEXT:    vmovss %xmm8, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a0)
   ret float %1
 }
 
@@ -621,7 +621,7 @@ define double @test_v2f64(<2 x double> %a0) {
 ; AVX512-NEXT:    vmovsd %xmm2, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0)
   ret double %1
 }
 
@@ -691,7 +691,7 @@ define double @test_v3f64(<3 x double> %a0) {
 ; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmin.v3f64(<3 x double> %a0)
   ret double %1
 }
 
@@ -765,7 +765,7 @@ define double @test_v4f64(<4 x double> %a0) {
 ; AVX512-NEXT:    vmovsd %xmm2, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a0)
   ret double %1
 }
 
@@ -913,7 +913,7 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512VL-NEXT:    vmovsd %xmm8, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0)
   ret double %1
 }
 
@@ -1082,17 +1082,17 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512-NEXT:    vmovsd %xmm2, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
-declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>)
+declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
 
-declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>)
-declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v3f64(<3 x double>)
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
index e99946edeee8..7840ae8ceed6 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -39,7 +39,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
 }
 
@@ -82,7 +82,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
 }
 
@@ -133,7 +133,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %1
 }
 
@@ -191,7 +191,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
   ret float %1
 }
 
@@ -225,7 +225,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
   ret float %1
 }
 
@@ -265,7 +265,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
   ret float %1
 }
 
@@ -313,7 +313,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
   ret float %1
 }
 
@@ -368,7 +368,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
   ret float %1
 }
 
@@ -402,7 +402,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
   ret float %1
 }
 
@@ -442,7 +442,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
   ret float %1
 }
 
@@ -490,7 +490,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
   ret float %1
 }
 
@@ -545,7 +545,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
+  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
   ret float %1
 }
 
@@ -575,7 +575,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
 
@@ -608,7 +608,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %1
 }
 
@@ -646,7 +646,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
   ret double %1
 }
 
@@ -691,7 +691,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
   ret double %1
 }
 
@@ -719,7 +719,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
   ret double %1
 }
 
@@ -750,7 +750,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
   ret double %1
 }
 
@@ -786,7 +786,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
   ret double %1
 }
 
@@ -828,7 +828,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
   ret double %1
 }
 
@@ -856,7 +856,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
   ret double %1
 }
 
@@ -887,7 +887,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
   ret double %1
 }
 
@@ -923,7 +923,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
   ret double %1
 }
 
@@ -965,16 +965,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
+  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
 
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
index 7a1c044a0042..3693ef07aecf 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
@@ -38,7 +38,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
 }
 
@@ -89,7 +89,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
 }
 
@@ -175,7 +175,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %1
 }
 
@@ -326,7 +326,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
   ret float %1
 }
 
@@ -360,7 +360,7 @@ define float @test_v2f32_one(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
   ret float %1
 }
 
@@ -407,7 +407,7 @@ define float @test_v4f32_one(<4 x float> %a0) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
   ret float %1
 }
 
@@ -489,7 +489,7 @@ define float @test_v8f32_one(<8 x float> %a0) {
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
   ret float %1
 }
 
@@ -636,7 +636,7 @@ define float @test_v16f32_one(<16 x float> %a0) {
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
   ret float %1
 }
 
@@ -668,7 +668,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float undef, <2 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
   ret float %1
 }
 
@@ -715,7 +715,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
   ret float %1
 }
 
@@ -797,7 +797,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float undef, <8 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
   ret float %1
 }
 
@@ -944,7 +944,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float undef, <16 x float> %a0)
+  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
   ret float %1
 }
 
@@ -973,7 +973,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
 
@@ -1011,7 +1011,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %1
 }
 
@@ -1070,7 +1070,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
   ret double %1
 }
 
@@ -1198,7 +1198,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
   ret double %1
 }
 
@@ -1226,7 +1226,7 @@ define double @test_v2f64_one(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
   ret double %1
 }
 
@@ -1263,7 +1263,7 @@ define double @test_v4f64_one(<4 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
   ret double %1
 }
 
@@ -1321,7 +1321,7 @@ define double @test_v8f64_one(<8 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
   ret double %1
 }
 
@@ -1419,7 +1419,7 @@ define double @test_v16f64_one(<16 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
   ret double %1
 }
 
@@ -1445,7 +1445,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double undef, <2 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
   ret double %1
 }
 
@@ -1480,7 +1480,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double undef, <4 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
   ret double %1
 }
 
@@ -1536,7 +1536,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double undef, <8 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
   ret double %1
 }
 
@@ -1633,16 +1633,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double undef, <16 x double> %a0)
+  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
 
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 17671c6a5591..d6cac398dfc0 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -85,7 +85,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a0)
   ret i64 %1
 }
 
@@ -231,7 +231,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %a0)
   ret i64 %1
 }
 
@@ -443,7 +443,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %a0)
   ret i64 %1
 }
 
@@ -763,7 +763,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> %a0)
   ret i64 %1
 }
 
@@ -799,7 +799,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
 
@@ -841,7 +841,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a0)
   ret i32 %1
 }
 
@@ -905,7 +905,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a0)
   ret i32 %1
 }
 
@@ -983,7 +983,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %a0)
   ret i32 %1
 }
 
@@ -1086,7 +1086,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> %a0)
   ret i32 %1
 }
 
@@ -1119,7 +1119,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
 
@@ -1154,7 +1154,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %a0)
   ret i16 %1
 }
 
@@ -1195,7 +1195,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %a0)
   ret i16 %1
 }
 
@@ -1258,7 +1258,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %a0)
   ret i16 %1
 }
 
@@ -1380,7 +1380,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> %a0)
   ret i16 %1
 }
 
@@ -1522,7 +1522,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> %a0)
   ret i16 %1
 }
 
@@ -1555,7 +1555,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -1607,7 +1607,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -1668,7 +1668,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -1842,7 +1842,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %a0)
   ret i8 %1
 }
 
@@ -2051,7 +2051,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %a0)
   ret i8 %1
 }
 
@@ -2325,7 +2325,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> %a0)
   ret i8 %1
 }
 
@@ -2685,32 +2685,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> %a0)
   ret i8 %1
 }
 
-declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index a497c83d2785..a5458d5264bb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -57,7 +57,7 @@ define i1 @trunc_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL-NEXT:    setne %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <2 x i64> %0 to <2 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a)
   ret i1 %b
 }
 
@@ -107,7 +107,7 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL-NEXT:    setne %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <4 x i32> %0 to <4 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -169,7 +169,7 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
 ; AVX512VL-NEXT:    setne %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <8 x i8> %0 to <8 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -198,7 +198,7 @@ define i1 @trunc_v16i8_v16i1(<16 x i8>) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    retq
   %a = trunc <16 x i8> %0 to <16 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -253,7 +253,7 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <4 x i64> %0 to <4 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -338,7 +338,7 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <8 x i32> %0 to <8 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -407,7 +407,7 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <16 x i16> %0 to <16 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -479,7 +479,7 @@ define i1 @trunc_v32i8_v32i1(<32 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <32 x i8> %0 to <32 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -580,7 +580,7 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <8 x i64> %0 to <8 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -661,7 +661,7 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) {
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %a = trunc <16 x i32> %0 to <16 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -748,7 +748,7 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <32 x i16> %0 to <32 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -828,7 +828,7 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <64 x i8> %0 to <64 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> %a)
   ret i1 %b
 }
 
@@ -894,7 +894,7 @@ define i1 @icmp_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL-NEXT:    setne %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <2 x i64> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a)
   ret i1 %b
 }
 
@@ -945,7 +945,7 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL-NEXT:    setne %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <4 x i32> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -998,7 +998,7 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
 ; AVX512VL-NEXT:    setne %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <8 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -1046,7 +1046,7 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>) {
 ; AVX512VL-NEXT:    setne %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <16 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -1129,7 +1129,7 @@ define i1 @icmp_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <4 x i64> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -1197,7 +1197,7 @@ define i1 @icmp_v8i32_v8i1(<8 x i32>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <8 x i32> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -1264,7 +1264,7 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <16 x i16> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -1341,7 +1341,7 @@ define i1 @icmp_v32i8_v32i1(<32 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <32 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -1441,7 +1441,7 @@ define i1 @icmp_v8i64_v8i1(<8 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <8 x i64> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -1499,7 +1499,7 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>) {
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %a = icmp eq <16 x i32> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -1587,7 +1587,7 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <32 x i16> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -1680,13 +1680,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <64 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> %a)
   ret i1 %b
 }
 
-declare i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1>)
-declare i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1>)
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>)
+declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index e1253975d5a6..9cc38d8ced0b 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -31,7 +31,7 @@ define i1 @test_v2i64(<2 x i64> %a0) {
 ; AVX-NEXT:    vptest %xmm0, %xmm0
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0)
   %2 = icmp eq i64 %1, 0
   ret i1 %2
 }
@@ -60,7 +60,7 @@ define i1 @test_v4i64(<4 x i64> %a0) {
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0)
   %2 = icmp ne i64 %1, 0
   ret i1 %2
 }
@@ -111,7 +111,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %a0)
   %2 = icmp eq i64 %1, 0
   ret i1 %2
 }
@@ -175,7 +175,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %a0)
   %2 = icmp ne i64 %1, 0
   ret i1 %2
 }
@@ -198,7 +198,7 @@ define i1 @test_v2i32(<2 x i32> %a0) {
 ; AVX-NEXT:    testq %rax, %rax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a0)
   %2 = icmp eq i32 %1, 0
   ret i1 %2
 }
@@ -224,7 +224,7 @@ define i1 @test_v4i32(<4 x i32> %a0) {
 ; AVX-NEXT:    vptest %xmm0, %xmm0
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a0)
   %2 = icmp ne i32 %1, 0
   ret i1 %2
 }
@@ -253,7 +253,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
   %2 = icmp eq i32 %1, 0
   ret i1 %2
 }
@@ -304,7 +304,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %a0)
   %2 = icmp ne i32 %1, 0
   ret i1 %2
 }
@@ -368,7 +368,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %a0)
   %2 = icmp eq i32 %1, 0
   ret i1 %2
 }
@@ -391,7 +391,7 @@ define i1 @test_v2i16(<2 x i16> %a0) {
 ; AVX-NEXT:    testl %eax, %eax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a0)
   %2 = icmp eq i16 %1, 0
   ret i1 %2
 }
@@ -410,7 +410,7 @@ define i1 @test_v4i16(<4 x i16> %a0) {
 ; AVX-NEXT:    testq %rax, %rax
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a0)
   %2 = icmp ne i16 %1, 0
   ret i1 %2
 }
@@ -436,7 +436,7 @@ define i1 @test_v8i16(<8 x i16> %a0) {
 ; AVX-NEXT:    vptest %xmm0, %xmm0
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a0)
   %2 = icmp eq i16 %1, 0
   ret i1 %2
 }
@@ -465,7 +465,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0)
   %2 = icmp ne i16 %1, 0
   ret i1 %2
 }
@@ -516,7 +516,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %a0)
   %2 = icmp eq i16 %1, 0
   ret i1 %2
 }
@@ -580,7 +580,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %a0)
   %2 = icmp ne i16 %1, 0
   ret i1 %2
 }
@@ -603,7 +603,7 @@ define i1 @test_v2i8(<2 x i8> %a0) {
 ; AVX-NEXT:    testw %ax, %ax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %a0)
   %2 = icmp eq i8 %1, 0
   ret i1 %2
 }
@@ -622,7 +622,7 @@ define i1 @test_v4i8(<4 x i8> %a0) {
 ; AVX-NEXT:    testl %eax, %eax
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a0)
   %2 = icmp ne i8 %1, 0
   ret i1 %2
 }
@@ -641,7 +641,7 @@ define i1 @test_v8i8(<8 x i8> %a0) {
 ; AVX-NEXT:    testq %rax, %rax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a0)
   %2 = icmp eq i8 %1, 0
   ret i1 %2
 }
@@ -667,7 +667,7 @@ define i1 @test_v16i8(<16 x i8> %a0) {
 ; AVX-NEXT:    vptest %xmm0, %xmm0
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a0)
   %2 = icmp ne i8 %1, 0
   ret i1 %2
 }
@@ -696,7 +696,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a0)
   %2 = icmp eq i8 %1, 0
   ret i1 %2
 }
@@ -747,7 +747,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %a0)
   %2 = icmp ne i8 %1, 0
   ret i1 %2
 }
@@ -811,7 +811,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %a0)
   %2 = icmp eq i8 %1, 0
   ret i1 %2
 }
@@ -841,7 +841,7 @@ define i1 @trunc_v2i64(<2 x i64> %a0) {
 ; AVX-NEXT:    vptest {{.*}}(%rip), %xmm0
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0)
   %2 = trunc i64 %1 to i16
   %3 = icmp eq i16 %2, 0
   ret i1 %3
@@ -888,7 +888,7 @@ define i1 @mask_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
   %2 = and i32 %1, 2147483648
   %3 = icmp eq i32 %2, 0
   ret i1 %3
@@ -935,7 +935,7 @@ define i1 @trunc_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    setne %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0)
   %2 = trunc i16 %1 to i8
   %3 = icmp ne i8 %2, 0
   ret i1 %3
@@ -1003,7 +1003,7 @@ define i1 @mask_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %a0)
   %2 = and i8 %1, 1
   %3 = icmp eq i8 %2, 0
   ret i1 %3
@@ -1037,34 +1037,34 @@ define zeroext i1 @PR44781(%struct.Box* %0) {
 ; AVX-NEXT:    retq
   %2 = bitcast %struct.Box* %0 to <4 x i32>*
   %3 = load <4 x i32>, <4 x i32>* %2, align 4
-  %4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %3)
+  %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %3)
   %5 = and i32 %4, 15
   %6 = icmp eq i32 %5, 0
   ret i1 %6
 }
 
-declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll
index 35193e9feadc..63c93f375582 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll
@@ -24,7 +24,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0)
   ret i64 %1
 }
 
@@ -66,7 +66,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0)
   ret i64 %1
 }
 
@@ -114,7 +114,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %a0)
   ret i64 %1
 }
 
@@ -171,7 +171,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %a0)
   ret i64 %1
 }
 
@@ -193,7 +193,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
 
@@ -215,7 +215,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a0)
   ret i32 %1
 }
 
@@ -265,7 +265,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
   ret i32 %1
 }
 
@@ -321,7 +321,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %a0)
   ret i32 %1
 }
 
@@ -386,7 +386,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %a0)
   ret i32 %1
 }
 
@@ -411,7 +411,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
 
@@ -436,7 +436,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a0)
   ret i16 %1
 }
 
@@ -465,7 +465,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a0)
   ret i16 %1
 }
 
@@ -528,7 +528,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0)
   ret i16 %1
 }
 
@@ -597,7 +597,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %a0)
   ret i16 %1
 }
 
@@ -675,7 +675,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %a0)
   ret i16 %1
 }
 
@@ -700,7 +700,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -726,7 +726,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -756,7 +756,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -790,7 +790,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a0)
   ret i8 %1
 }
 
@@ -862,7 +862,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a0)
   ret i8 %1
 }
 
@@ -940,7 +940,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %a0)
   ret i8 %1
 }
 
@@ -1027,32 +1027,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %a0)
   ret i8 %1
 }
 
-declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
index 89354e6b2d9f..d4c549250852 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
@@ -83,7 +83,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a0)
   ret i64 %1
 }
 
@@ -209,7 +209,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %a0)
   ret i64 %1
 }
 
@@ -404,7 +404,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %a0)
   ret i64 %1
 }
 
@@ -731,7 +731,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %a0)
   ret i64 %1
 }
 
@@ -771,7 +771,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
 
@@ -819,7 +819,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a0)
   ret i32 %1
 }
 
@@ -891,7 +891,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a0)
   ret i32 %1
 }
 
@@ -981,7 +981,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %a0)
   ret i32 %1
 }
 
@@ -1104,7 +1104,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %a0)
   ret i32 %1
 }
 
@@ -1137,7 +1137,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
 
@@ -1172,7 +1172,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a0)
   ret i16 %1
 }
 
@@ -1216,7 +1216,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX512-NEXT:    xorl $32767, %eax # imm = 0x7FFF
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a0)
   ret i16 %1
 }
 
@@ -1280,7 +1280,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %a0)
   ret i16 %1
 }
 
@@ -1354,7 +1354,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %a0)
   ret i16 %1
 }
 
@@ -1445,7 +1445,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %a0)
   ret i16 %1
 }
 
@@ -1491,7 +1491,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -1547,7 +1547,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -1615,7 +1615,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -1685,7 +1685,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX512-NEXT:    xorb $127, %al
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a0)
   ret i8 %1
 }
 
@@ -1781,7 +1781,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %a0)
   ret i8 %1
 }
 
@@ -1895,7 +1895,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %a0)
   ret i8 %1
 }
 
@@ -2042,32 +2042,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %a0)
   ret i8 %1
 }
 
-declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
index 106888b99c28..9543db9e950b 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
@@ -83,7 +83,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX512VL-NEXT:    vpminsq %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a0)
   ret i64 %1
 }
 
@@ -209,7 +209,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %a0)
   ret i64 %1
 }
 
@@ -404,7 +404,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %a0)
   ret i64 %1
 }
 
@@ -731,7 +731,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %a0)
   ret i64 %1
 }
 
@@ -771,7 +771,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX512-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
 
@@ -819,7 +819,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX512-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a0)
   ret i32 %1
 }
 
@@ -891,7 +891,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a0)
   ret i32 %1
 }
 
@@ -981,7 +981,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %a0)
   ret i32 %1
 }
 
@@ -1104,7 +1104,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %a0)
   ret i32 %1
 }
 
@@ -1137,7 +1137,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
 
@@ -1172,7 +1172,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a0)
   ret i16 %1
 }
 
@@ -1216,7 +1216,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX512-NEXT:    xorl $32768, %eax # imm = 0x8000
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a0)
   ret i16 %1
 }
 
@@ -1280,7 +1280,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %a0)
   ret i16 %1
 }
 
@@ -1354,7 +1354,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %a0)
   ret i16 %1
 }
 
@@ -1445,7 +1445,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %a0)
   ret i16 %1
 }
 
@@ -1491,7 +1491,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -1547,7 +1547,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -1615,7 +1615,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -1685,7 +1685,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX512-NEXT:    xorb $-128, %al
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a0)
   ret i8 %1
 }
 
@@ -1781,7 +1781,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %a0)
   ret i8 %1
 }
 
@@ -1895,7 +1895,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %a0)
   ret i8 %1
 }
 
@@ -2042,32 +2042,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %a0)
   ret i8 %1
 }
 
-declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index aedde4d11e06..27bf159b0c8c 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -89,7 +89,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX512VL-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a0)
   ret i64 %1
 }
 
@@ -231,7 +231,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %a0)
   ret i64 %1
 }
 
@@ -453,7 +453,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %a0)
   ret i64 %1
 }
 
@@ -832,7 +832,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %a0)
   ret i64 %1
 }
 
@@ -875,7 +875,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX512-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
 
@@ -929,7 +929,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX512-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a0)
   ret i32 %1
 }
 
@@ -1010,7 +1010,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a0)
   ret i32 %1
 }
 
@@ -1115,7 +1115,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a0)
   ret i32 %1
 }
 
@@ -1265,7 +1265,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %a0)
   ret i32 %1
 }
 
@@ -1311,7 +1311,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
 
@@ -1361,7 +1361,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a0)
   ret i16 %1
 }
 
@@ -1422,7 +1422,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX512VL-NEXT:    notl %eax
 ; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512VL-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a0)
   ret i16 %1
 }
 
@@ -1505,7 +1505,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %a0)
   ret i16 %1
 }
 
@@ -1602,7 +1602,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %a0)
   ret i16 %1
 }
 
@@ -1721,7 +1721,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %a0)
   ret i16 %1
 }
 
@@ -1754,7 +1754,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -1790,7 +1790,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -1832,7 +1832,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -1901,7 +1901,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX512VL-NEXT:    notb %al
 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512VL-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a0)
   ret i8 %1
 }
 
@@ -1994,7 +1994,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %a0)
   ret i8 %1
 }
 
@@ -2099,7 +2099,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %a0)
   ret i8 %1
 }
 
@@ -2222,32 +2222,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %a0)
   ret i8 %1
 }
 
-declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index c8195d241294..dee8970b96a5 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -89,7 +89,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX512VL-NEXT:    vpminuq %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a0)
   ret i64 %1
 }
 
@@ -232,7 +232,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %a0)
   ret i64 %1
 }
 
@@ -456,7 +456,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %a0)
   ret i64 %1
 }
 
@@ -836,7 +836,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %a0)
   ret i64 %1
 }
 
@@ -879,7 +879,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX512-NEXT:    vpminud %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
 
@@ -933,7 +933,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX512-NEXT:    vpminud %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a0)
   ret i32 %1
 }
 
@@ -1014,7 +1014,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a0)
   ret i32 %1
 }
 
@@ -1119,7 +1119,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %a0)
   ret i32 %1
 }
 
@@ -1269,7 +1269,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %a0)
   ret i32 %1
 }
 
@@ -1315,7 +1315,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
 
@@ -1365,7 +1365,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a0)
   ret i16 %1
 }
 
@@ -1407,7 +1407,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a0)
   ret i16 %1
 }
 
@@ -1467,7 +1467,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %a0)
   ret i16 %1
 }
 
@@ -1539,7 +1539,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %a0)
   ret i16 %1
 }
 
@@ -1632,7 +1632,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %a0)
   ret i16 %1
 }
 
@@ -1665,7 +1665,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -1701,7 +1701,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -1743,7 +1743,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -1791,7 +1791,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a0)
   ret i8 %1
 }
 
@@ -1859,7 +1859,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %a0)
   ret i8 %1
 }
 
@@ -1937,7 +1937,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %a0)
   ret i8 %1
 }
 
@@ -2032,32 +2032,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %a0)
   ret i8 %1
 }
 
-declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 5b0109bcd4eb..efb45ee36f9f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -57,7 +57,7 @@ define i1 @trunc_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <2 x i64> %0 to <2 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
   ret i1 %b
 }
 
@@ -107,7 +107,7 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <4 x i32> %0 to <4 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -172,7 +172,7 @@ define i1 @trunc_v8i16_v8i1(<8 x i8>) {
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <8 x i8> %0 to <8 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -201,7 +201,7 @@ define i1 @trunc_v16i8_v16i1(<16 x i8>) {
 ; AVX512-NEXT:    setnp %al
 ; AVX512-NEXT:    retq
   %a = trunc <16 x i8> %0 to <16 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -256,7 +256,7 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <4 x i64> %0 to <4 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -345,7 +345,7 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <8 x i32> %0 to <8 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -423,7 +423,7 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <16 x i16> %0 to <16 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -504,7 +504,7 @@ define i1 @trunc_v32i8_v32i1(<32 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <32 x i8> %0 to <32 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -609,7 +609,7 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <8 x i64> %0 to <8 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -717,7 +717,7 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <16 x i32> %0 to <16 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -815,7 +815,7 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <32 x i16> %0 to <32 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -912,7 +912,7 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = trunc <64 x i8> %0 to <64 x i1>
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> %a)
   ret i1 %b
 }
 
@@ -978,7 +978,7 @@ define i1 @icmp_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <2 x i64> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
   ret i1 %b
 }
 
@@ -1029,7 +1029,7 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <4 x i32> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -1082,7 +1082,7 @@ define i1 @icmp_v8i16_v8i1(<8 x i8>) {
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <8 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -1136,7 +1136,7 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>) {
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <16 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -1219,7 +1219,7 @@ define i1 @icmp_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <4 x i64> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
   ret i1 %b
 }
 
@@ -1288,7 +1288,7 @@ define i1 @icmp_v8i32_v8i1(<8 x i32>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <8 x i32> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -1366,7 +1366,7 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <16 x i16> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -1454,7 +1454,7 @@ define i1 @icmp_v32i8_v32i1(<32 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <32 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -1557,7 +1557,7 @@ define i1 @icmp_v8i64_v8i1(<8 x i64>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <8 x i64> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
   ret i1 %b
 }
 
@@ -1643,7 +1643,7 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <16 x i32> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
   ret i1 %b
 }
 
@@ -1743,7 +1743,7 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <32 x i16> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a)
   ret i1 %b
 }
 
@@ -1853,13 +1853,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>) {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <64 x i8> %0, zeroinitializer
-  %b = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> %a)
+  %b = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> %a)
   ret i1 %b
 }
 
-declare i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1>)
-declare i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1>)
+declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1>)
+declare i1 @llvm.vector.reduce.xor.v64i1(<64 x i1>)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
index 4fd84897445c..b1306c7dabbe 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
@@ -24,7 +24,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a0)
   ret i64 %1
 }
 
@@ -66,7 +66,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %a0)
   ret i64 %1
 }
 
@@ -114,7 +114,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %a0)
   ret i64 %1
 }
 
@@ -171,7 +171,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> %a0)
+  %1 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %a0)
   ret i64 %1
 }
 
@@ -193,7 +193,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
 
@@ -215,7 +215,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a0)
   ret i32 %1
 }
 
@@ -265,7 +265,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a0)
   ret i32 %1
 }
 
@@ -321,7 +321,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %a0)
   ret i32 %1
 }
 
@@ -386,7 +386,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> %a0)
+  %1 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %a0)
   ret i32 %1
 }
 
@@ -411,7 +411,7 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
 
@@ -436,7 +436,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a0)
   ret i16 %1
 }
 
@@ -465,7 +465,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a0)
   ret i16 %1
 }
 
@@ -528,7 +528,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %a0)
   ret i16 %1
 }
 
@@ -597,7 +597,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %a0)
   ret i16 %1
 }
 
@@ -675,7 +675,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> %a0)
+  %1 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %a0)
   ret i16 %1
 }
 
@@ -700,7 +700,7 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
 
@@ -726,7 +726,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a0)
   ret i8 %1
 }
 
@@ -756,7 +756,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a0)
   ret i8 %1
 }
 
@@ -790,7 +790,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a0)
   ret i8 %1
 }
 
@@ -862,7 +862,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %a0)
   ret i8 %1
 }
 
@@ -940,7 +940,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %a0)
   ret i8 %1
 }
 
@@ -1027,32 +1027,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> %a0)
+  %1 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %a0)
   ret i8 %1
 }
 
-declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
 
-declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
 
-declare i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
 
-declare i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll
similarity index 66%
rename from llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll
rename to llvm/test/Instrumentation/MemorySanitizer/reduce.ll
index e81333b515e2..2629756644e5 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll
@@ -5,9 +5,9 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-declare i32 @llvm.experimental.vector.reduce.add(<3 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and(<3 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or(<3 x i32>)
+declare i32 @llvm.vector.reduce.add(<3 x i32>)
+declare i32 @llvm.vector.reduce.and(<3 x i32>)
+declare i32 @llvm.vector.reduce.or(<3 x i32>)
 
 ; CHECK-LABEL: @reduce_add
 define i32 @reduce_add() sanitize_memory {
@@ -17,9 +17,9 @@ define i32 @reduce_add() sanitize_memory {
   %o = load <3 x i32>, <3 x i32> *%p
 ; CHECK: [[O_SHADOW:%.*]] = load <3 x i32>, <3 x i32>*
 ; CHECK: [[O_ORIGIN:%.*]] = load i32, i32*
-; CHECK: [[R_SHADOW:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
-; CHECK: [[R:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v3i32(<3 x i32> [[O]])
-  %r = call i32 @llvm.experimental.vector.reduce.add(<3 x i32> %o)
+; CHECK: [[R_SHADOW:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[O]])
+  %r = call i32 @llvm.vector.reduce.add(<3 x i32> %o)
 ; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls
 ; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls
 ; CHECK: ret i32 [[R]]
@@ -35,11 +35,11 @@ define i32 @reduce_and() sanitize_memory {
 ; CHECK: [[O_SHADOW:%.*]] = load <3 x i32>, <3 x i32>*
 ; CHECK: [[O_ORIGIN:%.*]] = load i32, i32*
 ; CHECK: [[O_SHADOW_1:%.*]] = or <3 x i32> [[O]], [[O_SHADOW]]
-; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
-; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
+; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
+; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
 ; CHECK: [[R_SHADOW:%.*]] = and i32 [[O_SHADOW_2]], [[O_SHADOW_3]]
-; CHECK: [[R:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> [[O]])
-  %r = call i32 @llvm.experimental.vector.reduce.and(<3 x i32> %o)
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O]])
+  %r = call i32 @llvm.vector.reduce.and(<3 x i32> %o)
 ; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls
 ; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls
 ; CHECK: ret i32 [[R]]
@@ -56,11 +56,11 @@ define i32 @reduce_or() sanitize_memory {
 ; CHECK: [[O_ORIGIN:%.*]] = load i32, i32*
 ; CHECK: [[NOT_O:%.*]] = xor <3 x i32> [[O]], <i32 -1, i32 -1, i32 -1>
 ; CHECK: [[O_SHADOW_1:%.*]] = or <3 x i32> [[NOT_O]], [[O_SHADOW]]
-; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
-; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
+; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
+; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
 ; CHECK: [[R_SHADOW:%.*]] = and i32 [[O_SHADOW_2]], [[O_SHADOW_3]]
-; CHECK: [[R:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O]])
-  %r = call i32 @llvm.experimental.vector.reduce.or(<3 x i32> %o)
+; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O]])
+  %r = call i32 @llvm.vector.reduce.or(<3 x i32> %o)
 ; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls
 ; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls
 ; CHECK: ret i32 [[R]]

diff --git a/llvm/test/Transforms/InstCombine/vector-reductions.ll b/llvm/test/Transforms/InstCombine/vector-reductions.ll
index f11307468457..02551c630c8a 100644
--- a/llvm/test/Transforms/InstCombine/vector-reductions.ll
+++ b/llvm/test/Transforms/InstCombine/vector-reductions.ll
@@ -1,23 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
 declare void @use_f32(float)
 
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
 declare void @use_i32(i32)
 
 define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
 ; CHECK-LABEL: @diff_of_sums_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub reassoc nsz <4 x float> [[V0:%.*]], [[V1:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call reassoc nsz float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = fsub reassoc nsz float [[TMP2]], [[A1:%.*]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
-  %r0 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0)
-  %r1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1)
+  %r0 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
+  %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
   %r = fsub reassoc nsz float %r0, %r1
   ret float %r
 }
@@ -26,13 +26,13 @@ define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x flo
 
 define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
 ; CHECK-LABEL: @diff_of_sums_v4f32_fmf(
-; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
-; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
+; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fsub nnan ninf nsz float [[R0]], [[R1]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
-  %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0)
-  %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1)
+  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
+  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
   %r = fsub ninf nnan nsz float %r0, %r1
   ret float %r
 }
@@ -41,15 +41,15 @@ define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x
 
 define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
 ; CHECK-LABEL: @diff_of_sums_extra_use1(
-; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
+; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
 ; CHECK-NEXT:    call void @use_f32(float [[R0]])
-; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fsub fast float [[R0]], [[R1]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
-  %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0)
+  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
   call void @use_f32(float %r0)
-  %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1)
+  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
   %r = fsub fast float %r0, %r1
   ret float %r
 }
@@ -58,14 +58,14 @@ define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4
 
 define float @diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
 ; CHECK-LABEL: @diff_of_sums_extra_use2(
-; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
-; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
+; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
 ; CHECK-NEXT:    call void @use_f32(float [[R1]])
 ; CHECK-NEXT:    [[R:%.*]] = fsub fast float [[R0]], [[R1]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
-  %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0)
-  %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1)
+  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
+  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
   call void @use_f32(float %r1)
   %r = fsub fast float %r0, %r1
   ret float %r
@@ -75,13 +75,13 @@ define float @diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4
 
 define float @diff_of_sums_type_mismatch(float %a0, <4 x float> %v0, float %a1, <8 x float> %v1) {
 ; CHECK-LABEL: @diff_of_sums_type_mismatch(
-; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
-; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float [[A1:%.*]], <8 x float> [[V1:%.*]])
+; CHECK-NEXT:    [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float [[A1:%.*]], <8 x float> [[V1:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fsub fast float [[R0]], [[R1]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
-  %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0)
-  %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a1, <8 x float> %v1)
+  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
+  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a1, <8 x float> %v1)
   %r = fsub fast float %r0, %r1
   ret float %r
 }
@@ -89,11 +89,11 @@ define float @diff_of_sums_type_mismatch(float %a0, <4 x float> %v0, float %a1,
 define i32 @diff_of_sums_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @diff_of_sums_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
-  %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0)
-  %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1)
+  %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
+  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
   %r = sub i32 %r0, %r1
   ret i32 %r
 }
@@ -102,15 +102,15 @@ define i32 @diff_of_sums_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 
 define i32 @diff_of_sums_v4i32_extra_use1(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @diff_of_sums_v4i32_extra_use1(
-; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
+; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
 ; CHECK-NEXT:    call void @use_i32(i32 [[R0]])
-; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = sub i32 [[R0]], [[R1]]
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
-  %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0)
+  %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
   call void @use_i32(i32 %r0)
-  %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1)
+  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
   %r = sub i32 %r0, %r1
   ret i32 %r
 }
@@ -119,14 +119,14 @@ define i32 @diff_of_sums_v4i32_extra_use1(<4 x i32> %v0, <4 x i32> %v1) {
 
 define i32 @diff_of_sums_v4i32_extra_use2(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @diff_of_sums_v4i32_extra_use2(
-; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
-; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
+; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
 ; CHECK-NEXT:    call void @use_i32(i32 [[R1]])
 ; CHECK-NEXT:    [[R:%.*]] = sub i32 [[R0]], [[R1]]
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
-  %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0)
-  %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1)
+  %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
+  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
   call void @use_i32(i32 %r1)
   %r = sub i32 %r0, %r1
   ret i32 %r
@@ -136,13 +136,13 @@ define i32 @diff_of_sums_v4i32_extra_use2(<4 x i32> %v0, <4 x i32> %v1) {
 
 define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @diff_of_sums_type_mismatch2(
-; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[V0:%.*]])
-; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
+; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V0:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = sub i32 [[R0]], [[R1]]
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
-  %r0 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %v0)
-  %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1)
+  %r0 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v0)
+  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
   %r = sub i32 %r0, %r1
   ret i32 %r
 }

diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
index 5f1861e24b62..a2bc86c9ac66 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
@@ -1,31 +1,31 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 
-declare i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a)
-declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a)
+declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
+declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %a)
+declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
+declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a)
+declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a)
+declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %a)
+declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a)
+declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %a)
+declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a)
+declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %a)
+declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a)
+declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %a)
+declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a)
+declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %a)
+declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a)
+declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a)
+declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a)
 
 
 define i32 @add_0() {
 ; CHECK-LABEL: @add_0(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> zeroinitializer)
+  %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> zeroinitializer)
   ret i32 %x
 }
 
@@ -33,7 +33,7 @@ define i32 @add_1() {
 ; CHECK-LABEL: @add_1(
 ; CHECK-NEXT:    ret i32 8
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -41,7 +41,7 @@ define i32 @add_inc() {
 ; CHECK-LABEL: @add_inc(
 ; CHECK-NEXT:    ret i32 18
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
   ret i32 %x
 }
 
@@ -49,25 +49,25 @@ define i32 @add_1v() {
 ; CHECK-LABEL: @add_1v(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> <i32 10>)
+  %x = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> <i32 10>)
   ret i32 %x
 }
 
 define i32 @add_undef() {
 ; CHECK-LABEL: @add_undef(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
+  %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
   ret i32 %x
 }
 
 define i32 @add_undef1() {
 ; CHECK-LABEL: @add_undef1(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -77,7 +77,7 @@ define i32 @mul_0() {
 ; CHECK-LABEL: @mul_0(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> zeroinitializer)
+  %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> zeroinitializer)
   ret i32 %x
 }
 
@@ -85,7 +85,7 @@ define i32 @mul_1() {
 ; CHECK-LABEL: @mul_1(
 ; CHECK-NEXT:    ret i32 1
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -93,7 +93,7 @@ define i32 @mul_inc() {
 ; CHECK-LABEL: @mul_inc(
 ; CHECK-NEXT:    ret i32 40320
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
   ret i32 %x
 }
 
@@ -101,25 +101,25 @@ define i32 @mul_1v() {
 ; CHECK-LABEL: @mul_1v(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> <i32 10>)
+  %x = call i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> <i32 10>)
   ret i32 %x
 }
 
 define i32 @mul_undef() {
 ; CHECK-LABEL: @mul_undef(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
+  %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
   ret i32 %x
 }
 
 define i32 @mul_undef1() {
 ; CHECK-LABEL: @mul_undef1(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -128,7 +128,7 @@ define i32 @and_0() {
 ; CHECK-LABEL: @and_0(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> zeroinitializer)
+  %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> zeroinitializer)
   ret i32 %x
 }
 
@@ -136,7 +136,7 @@ define i32 @and_1() {
 ; CHECK-LABEL: @and_1(
 ; CHECK-NEXT:    ret i32 1
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -144,7 +144,7 @@ define i32 @and_inc() {
 ; CHECK-LABEL: @and_inc(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
   ret i32 %x
 }
 
@@ -152,25 +152,25 @@ define i32 @and_1v() {
 ; CHECK-LABEL: @and_1v(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> <i32 10>)
+  %x = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> <i32 10>)
   ret i32 %x
 }
 
 define i32 @and_undef() {
 ; CHECK-LABEL: @and_undef(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef)
+  %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
   ret i32 %x
 }
 
 define i32 @and_undef1() {
 ; CHECK-LABEL: @and_undef1(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -179,7 +179,7 @@ define i32 @or_0() {
 ; CHECK-LABEL: @or_0(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> zeroinitializer)
+  %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> zeroinitializer)
   ret i32 %x
 }
 
@@ -187,7 +187,7 @@ define i32 @or_1() {
 ; CHECK-LABEL: @or_1(
 ; CHECK-NEXT:    ret i32 1
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -195,7 +195,7 @@ define i32 @or_inc() {
 ; CHECK-LABEL: @or_inc(
 ; CHECK-NEXT:    ret i32 -1
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
   ret i32 %x
 }
 
@@ -203,25 +203,25 @@ define i32 @or_1v() {
 ; CHECK-LABEL: @or_1v(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> <i32 10>)
+  %x = call i32 @llvm.vector.reduce.or.v1i32(<1 x i32> <i32 10>)
   ret i32 %x
 }
 
 define i32 @or_undef() {
 ; CHECK-LABEL: @or_undef(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef)
+  %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
   ret i32 %x
 }
 
 define i32 @or_undef1() {
 ; CHECK-LABEL: @or_undef1(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -230,7 +230,7 @@ define i32 @xor_0() {
 ; CHECK-LABEL: @xor_0(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> zeroinitializer)
+  %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> zeroinitializer)
   ret i32 %x
 }
 
@@ -238,7 +238,7 @@ define i32 @xor_1() {
 ; CHECK-LABEL: @xor_1(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -246,7 +246,7 @@ define i32 @xor_inc() {
 ; CHECK-LABEL: @xor_inc(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
   ret i32 %x
 }
 
@@ -254,25 +254,25 @@ define i32 @xor_1v() {
 ; CHECK-LABEL: @xor_1v(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> <i32 10>)
+  %x = call i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> <i32 10>)
   ret i32 %x
 }
 
 define i32 @xor_undef() {
 ; CHECK-LABEL: @xor_undef(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef)
+  %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
   ret i32 %x
 }
 
 define i32 @xor_undef1() {
 ; CHECK-LABEL: @xor_undef1(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -281,7 +281,7 @@ define i32 @smin_0() {
 ; CHECK-LABEL: @smin_0(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> zeroinitializer)
+  %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> zeroinitializer)
   ret i32 %x
 }
 
@@ -289,7 +289,7 @@ define i32 @smin_1() {
 ; CHECK-LABEL: @smin_1(
 ; CHECK-NEXT:    ret i32 1
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -297,7 +297,7 @@ define i32 @smin_inc() {
 ; CHECK-LABEL: @smin_inc(
 ; CHECK-NEXT:    ret i32 -6
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
   ret i32 %x
 }
 
@@ -305,25 +305,25 @@ define i32 @smin_1v() {
 ; CHECK-LABEL: @smin_1v(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> <i32 10>)
+  %x = call i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> <i32 10>)
   ret i32 %x
 }
 
 define i32 @smin_undef() {
 ; CHECK-LABEL: @smin_undef(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
+  %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
   ret i32 %x
 }
 
 define i32 @smin_undef1() {
 ; CHECK-LABEL: @smin_undef1(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -332,7 +332,7 @@ define i32 @smax_0() {
 ; CHECK-LABEL: @smax_0(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> zeroinitializer)
+  %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> zeroinitializer)
   ret i32 %x
 }
 
@@ -340,7 +340,7 @@ define i32 @smax_1() {
 ; CHECK-LABEL: @smax_1(
 ; CHECK-NEXT:    ret i32 1
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -348,7 +348,7 @@ define i32 @smax_inc() {
 ; CHECK-LABEL: @smax_inc(
 ; CHECK-NEXT:    ret i32 8
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
   ret i32 %x
 }
 
@@ -356,25 +356,25 @@ define i32 @smax_1v() {
 ; CHECK-LABEL: @smax_1v(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> <i32 10>)
+  %x = call i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> <i32 10>)
   ret i32 %x
 }
 
 define i32 @smax_undef() {
 ; CHECK-LABEL: @smax_undef(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
+  %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
   ret i32 %x
 }
 
 define i32 @smax_undef1() {
 ; CHECK-LABEL: @smax_undef1(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -383,7 +383,7 @@ define i32 @umin_0() {
 ; CHECK-LABEL: @umin_0(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> zeroinitializer)
+  %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> zeroinitializer)
   ret i32 %x
 }
 
@@ -391,7 +391,7 @@ define i32 @umin_1() {
 ; CHECK-LABEL: @umin_1(
 ; CHECK-NEXT:    ret i32 1
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -399,7 +399,7 @@ define i32 @umin_inc() {
 ; CHECK-LABEL: @umin_inc(
 ; CHECK-NEXT:    ret i32 1
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
   ret i32 %x
 }
 
@@ -407,25 +407,25 @@ define i32 @umin_1v() {
 ; CHECK-LABEL: @umin_1v(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> <i32 10>)
+  %x = call i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> <i32 10>)
   ret i32 %x
 }
 
 define i32 @umin_undef() {
 ; CHECK-LABEL: @umin_undef(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
+  %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
   ret i32 %x
 }
 
 define i32 @umin_undef1() {
 ; CHECK-LABEL: @umin_undef1(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -434,7 +434,7 @@ define i32 @umax_0() {
 ; CHECK-LABEL: @umax_0(
 ; CHECK-NEXT:    ret i32 0
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> zeroinitializer)
+  %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> zeroinitializer)
   ret i32 %x
 }
 
@@ -442,7 +442,7 @@ define i32 @umax_1() {
 ; CHECK-LABEL: @umax_1(
 ; CHECK-NEXT:    ret i32 1
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }
 
@@ -450,7 +450,7 @@ define i32 @umax_inc() {
 ; CHECK-LABEL: @umax_inc(
 ; CHECK-NEXT:    ret i32 -3
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
   ret i32 %x
 }
 
@@ -458,24 +458,24 @@ define i32 @umax_1v() {
 ; CHECK-LABEL: @umax_1v(
 ; CHECK-NEXT:    ret i32 10
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> <i32 10>)
+  %x = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> <i32 10>)
   ret i32 %x
 }
 
 define i32 @umax_undef() {
 ; CHECK-LABEL: @umax_undef(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
+  %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
   ret i32 %x
 }
 
 define i32 @umax_undef1d() {
 ; CHECK-LABEL: @umax_undef1d(
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
-  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret i32 %x
 }

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
index eaed8276a9c6..0f2dfaeb55f6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
@@ -8,7 +8,7 @@ target triple = "aarch64--linux-gnu"
 ; Function Attrs: norecurse nounwind readonly
 define i32 @fn1() local_unnamed_addr #0 {
 ; Ensure that we don't emit reduction intrinsics for unsupported short reductions.
-; CHECK-NOT: @llvm.experimental.vector.reduce
+; CHECK-NOT: @llvm.vector.reduce
 entry:
   %0 = load i32, i32* @b, align 4, !tbaa !1
   %cmp40 = icmp sgt i32 %0, 0

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
index 50449d76b827..a95c0aa6f375 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -20,7 +20,7 @@ target triple = "aarch64--linux-gnu"
 ; CHECK:   add <16 x i8>
 ;
 ; CHECK: middle.block:
-; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>
+; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8>
 ; CHECK:   zext i8 [[Rdx]] to i32
 ;
 define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
@@ -75,7 +75,7 @@ for.body:
 ; CHECK:   add <8 x i16>
 ;
 ; CHECK: middle.block:
-; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>
+; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
 ; CHECK:   zext i16 [[Rdx]] to i32
 ;
 define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
@@ -132,7 +132,7 @@ for.body:
 ; CHECK:   add <8 x i16>
 ;
 ; CHECK: middle.block:
-; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>
+; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
 ; CHECK:   zext i16 [[Rdx]] to i32
 ;
 define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
index d9254f8d9323..6142b9b0b47f 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
@@ -628,7 +628,7 @@ end:                                 ; preds = %end, %entry
   ret void
 }
 
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
index b7dc26eb4556..2cd97c1b416d 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -23,7 +23,7 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
@@ -77,7 +77,7 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
@@ -132,7 +132,7 @@ define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
@@ -186,7 +186,7 @@ define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -240,7 +240,7 @@ define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -294,7 +294,7 @@ define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -348,7 +348,7 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -402,7 +402,7 @@ define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -451,7 +451,7 @@ define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -508,7 +508,7 @@ define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
index 57b63079ba88..787fe5566422 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
@@ -41,7 +41,7 @@ define i32 @mla_i32(i8* noalias nocapture readonly %A, i8* noalias nocapture rea
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
@@ -129,7 +129,7 @@ define i32 @mla_i8(i8* noalias nocapture readonly %A, i8* noalias nocapture read
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP11]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
@@ -210,7 +210,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
@@ -281,7 +281,7 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
@@ -352,7 +352,7 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
@@ -423,7 +423,7 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
@@ -494,7 +494,7 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
@@ -565,7 +565,7 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
@@ -636,7 +636,7 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
@@ -703,7 +703,7 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -773,7 +773,7 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -843,7 +843,7 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -913,7 +913,7 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index 614d055730d8..267406af706c 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -183,7 +183,7 @@ define i32 @add_i32_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
@@ -232,7 +232,7 @@ define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
@@ -282,7 +282,7 @@ define i32 @add_i8_i32(i8* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
@@ -330,7 +330,7 @@ define signext i16 @add_i16_i16(i16* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP5:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
@@ -379,7 +379,7 @@ define signext i16 @add_i8_i16(i8* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP4]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
@@ -427,7 +427,7 @@ define zeroext i8 @add_i8_i8(i8* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP3]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
@@ -664,7 +664,7 @@ define i32 @mla_i32_i32(i32* nocapture readonly %x, i32* nocapture readonly %y,
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
@@ -720,7 +720,7 @@ define i32 @mla_i16_i32(i16* nocapture readonly %x, i16* nocapture readonly %y,
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
@@ -778,7 +778,7 @@ define i32 @mla_i8_i32(i8* nocapture readonly %x, i8* nocapture readonly %y, i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
@@ -834,7 +834,7 @@ define signext i16 @mla_i16_i16(i16* nocapture readonly %x, i16* nocapture reado
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP6]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
@@ -890,7 +890,7 @@ define signext i16 @mla_i8_i16(i8* nocapture readonly %x, i8* nocapture readonly
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP8]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP8]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
@@ -946,7 +946,7 @@ define zeroext i8 @mla_i8_i8(i8* nocapture readonly %x, i8* nocapture readonly %
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP6]])
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
index cdcb81ec2dc2..075abffb4283 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
@@ -29,8 +29,8 @@ define arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32* nocaptur
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
index 95b22eb9660a..18b130224f98 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
@@ -595,7 +595,7 @@ define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -665,7 +665,7 @@ define i32 @i32_smax_reduction(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -735,7 +735,7 @@ define i32 @i32_umin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -805,7 +805,7 @@ define i32 @i32_umax_reduction(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index fa6f080a652c..9aa5d586a30e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -110,7 +110,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; AVX-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; AVX-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; AVX:       middle.block:
-; AVX-NEXT:    [[TMP8:%.*]] = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[PREDPHI]])
+; AVX-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[PREDPHI]])
 ; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
 ; AVX-NEXT:    br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
 ; AVX:       scalar.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
index 1ce437198cb0..be1b7c2d7ae8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -59,7 +59,7 @@ define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]]
 ; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP12]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX12:%.*]] = add <16 x i32> [[TMP13]], [[BIN_RDX11]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 900db950ee31..f4a6f04252f7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -97,7 +97,7 @@ define i32 @test_explicit_pred(i64 %len) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]]
 ; CHECK-NEXT:    [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX19]]
-; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]])
+; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -263,7 +263,7 @@ define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
 ; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -450,7 +450,7 @@ define i32 @test_invariant_address(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP101]], [[TMP100]]
 ; CHECK-NEXT:    [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX7]]
-; CHECK-NEXT:    [[TMP105:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
+; CHECK-NEXT:    [[TMP105:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -776,7 +776,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]]
 ; CHECK-NEXT:    [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]]
-; CHECK-NEXT:    [[TMP185:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
+; CHECK-NEXT:    [[TMP185:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -953,7 +953,7 @@ define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]]
 ; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP86:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1126,7 +1126,7 @@ define i32 @test_non_zero_start(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
 ; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 3072, 3072
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1477,7 +1477,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP149]], [[TMP148]]
 ; CHECK-NEXT:    [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX37]]
-; CHECK-NEXT:    [[TMP153:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
+; CHECK-NEXT:    [[TMP153:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 2048, 2048
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1644,7 +1644,7 @@ define i32 @neg_off_by_many(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
 ; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1811,7 +1811,7 @@ define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
 ; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1978,7 +1978,7 @@ define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
 ; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
 ; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index e71c40419f9f..a833b982e995 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -90,7 +90,7 @@ define i32 @main() local_unnamed_addr #0 {
 ; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP27]], [[TMP26]]
-; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP7]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
index a8dc5350c08a..f9a469cd093e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
@@ -27,7 +27,7 @@ define zeroext i8 @sum() {
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
 ; CHECK-NEXT:    ret i8 [[TMP7]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index cb0d0503dcb1..f2498b97ca2b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -76,7 +76,7 @@ define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -148,7 +148,7 @@ define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -220,7 +220,7 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %arra
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call reassoc contract float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index 0055a66db59d..99acd8e07880 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -62,7 +62,7 @@ define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j
 ; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
 ; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP39:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP37]])
+; CHECK-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP37]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 96
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
index 044128c8ee74..1017d79792f5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -195,7 +195,7 @@ define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP15]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]

diff  --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
index a4f9ac13664a..fe5cc16314a2 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -27,13 +27,13 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 12
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD4]])
 ; CHECK-NEXT:    [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD5]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD5]])
 ; CHECK-NEXT:    [[TMP13]] = add i32 [[TMP12]], [[VEC_PHI2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD6]])
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD6]])
 ; CHECK-NEXT:    [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI3]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256

diff  --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 3994d64f91d3..adcf8f5f0b93 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -15,7 +15,7 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -65,11 +65,11 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
 ; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
@@ -121,7 +121,7 @@ define i32 @reduction_sum_const(i32* noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP4]] = add i32 [[TMP3]], 12
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
@@ -173,11 +173,11 @@ define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
 ; CHECK-NEXT:    [[TMP9]] = mul i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
@@ -234,9 +234,9 @@ define i32 @reduction_mix(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[TMP8]] = add i32 [[TMP7]], [[TMP6]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
@@ -291,9 +291,9 @@ define i32 @reduction_mul(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
 ; CHECK-NEXT:    [[TMP7]] = mul i32 [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -346,7 +346,7 @@ define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* no
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -398,9 +398,9 @@ define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD1]])
 ; CHECK-NEXT:    [[TMP7]] = and i32 [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -453,7 +453,7 @@ define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6]] = or i32 [[TMP5]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -506,7 +506,7 @@ define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6]] = xor i32 [[TMP5]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -558,9 +558,9 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd float [[TMP4]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD1]])
 ; CHECK-NEXT:    [[TMP7]] = fadd float [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -612,9 +612,9 @@ define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul float [[TMP4]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD1]])
 ; CHECK-NEXT:    [[TMP7]] = fmul float [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -663,7 +663,7 @@ define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt i32 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[RDX_MINMAX_SELECT]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP2]], i32 [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
@@ -711,7 +711,7 @@ define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ugt i32 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[RDX_MINMAX_SELECT]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP2]], i32 [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
@@ -765,7 +765,7 @@ define i32 @reduction_sub_lhs(i32* noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !30
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -827,7 +827,7 @@ define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !32
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -950,11 +950,11 @@ define i32 @reduction_predicated(i32* noalias nocapture %A, i32* noalias nocaptu
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
 ; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
@@ -1012,7 +1012,7 @@ define i8 @reduction_add_trunc(i8* noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !36
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]])
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
@@ -1059,7 +1059,7 @@ define i8 @reduction_and_trunc(i8* noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6]] = and i32 [[TMP5]], [[TMP0]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256

diff  --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
index b8a4d16627fe..1d450e4c5259 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
@@ -10,7 +10,7 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
 ; CHECK:         [[TMP24:%.*]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP23:%.*]], <4 x i32> zeroinitializer
 ; CHECK:         [[TMP25]] = add <4 x i32> [[VEC_PHI]], [[TMP24]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP27:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
+; CHECK:         [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
 ;
 entry:
   br label %.lr.ph
@@ -38,7 +38,7 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK:         [[TMP46:%.*]] = add <4 x i32> [[TMP45]], [[TMP43:%.*]]
 ; CHECK:         [[TMP47]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP49:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP47]])
+; CHECK:         [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP47]])
 ;
 entry:
   br label %.lr.ph
@@ -70,7 +70,7 @@ define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK:         [[TMP45:%.*]] = mul <4 x i32> [[TMP44]], [[TMP43:%.*]]
 ; CHECK:         [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP46]])
+; CHECK:         [[TMP48:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP46]])
 ;
 entry:
   br label %.lr.ph
@@ -101,7 +101,7 @@ define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK:         [[TMP45:%.*]] = and <4 x i32> [[TMP44]], [[TMP43:%.*]]
 ; CHECK:         [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP46]])
+; CHECK:         [[TMP48:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP46]])
 ;
 entry:
   br label %for.body
@@ -131,7 +131,7 @@ define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK:         [[TMP45:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP44:%.*]], <4 x i32> zeroinitializer
 ; CHECK:         [[TMP46]] = or <4 x i32> [[VEC_PHI]], [[TMP45]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP46]])
+; CHECK:         [[TMP48:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP46]])
 ;
 entry:
   br label %for.body
@@ -161,7 +161,7 @@ define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK:         [[TMP45:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP44:%.*]], <4 x i32> zeroinitializer
 ; CHECK:         [[TMP46]] = xor <4 x i32> [[VEC_PHI]], [[TMP45]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP46]])
+; CHECK:         [[TMP48:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP46]])
 ;
 entry:
   br label %for.body
@@ -192,7 +192,7 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
 ; CHECK:         [[TMP45:%.*]] = fadd fast <4 x float> [[TMP44]], [[TMP43:%.*]]
 ; CHECK:         [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP48:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP46]])
+; CHECK:         [[TMP48:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP46]])
 ;
 entry:
   br label %for.body
@@ -223,7 +223,7 @@ define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
 ; CHECK:         [[TMP45:%.*]] = fmul fast <4 x float> [[TMP44]], [[TMP43:%.*]]
 ; CHECK:         [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP48:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP46]])
+; CHECK:         [[TMP48:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP46]])
 ;
 entry:
   br label %for.body
@@ -254,7 +254,7 @@ define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK:         [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]]
 ; CHECK:         [[TMP26]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP26]])
+; CHECK:         [[TMP28:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP26]])
 ;
 entry:
   br label %for.body
@@ -283,7 +283,7 @@ define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK:         [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]]
 ; CHECK:         [[TMP26]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]]
 ; CHECK:       middle.block:
-; CHECK:         [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP26]])
+; CHECK:         [[TMP28:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP26]])
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index cf01ead15b0e..5e7a8dbb7270 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
 ; CHECK-NEXT:    [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[Z]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[Z]])
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %z = and <4 x i32> %x, %y
@@ -74,7 +74,7 @@ define i32 @TestVectorsEqual(i32* noalias %Vec0, i32* noalias %Vec1, i32 %Tolera
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
 ; CHECK-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP8]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT:    [[COND6:%.*]] = zext i1 [[CMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[COND6]]
@@ -133,7 +133,7 @@ define i32 @TestVectorsEqual_alt(i32* noalias %Vec0, i32* noalias %Vec1, i32 %To
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP3:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[CMP3]] to i32
 ; CHECK-NEXT:    ret i32 [[COND]]
@@ -181,7 +181,7 @@ define i32 @TestVectorsEqualFP(float* noalias %Vec0, float* noalias %Vec1, float
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]])
-; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
 ; CHECK-NEXT:    [[CMP4:%.*]] = fcmp fast ole float [[TMP6]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT:    [[COND5:%.*]] = zext i1 [[CMP4]] to i32
 ; CHECK-NEXT:    ret i32 [[COND5]]
@@ -240,7 +240,7 @@ define i32 @TestVectorsEqualFP_alt(float* noalias %Vec0, float* noalias %Vec1, f
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP4]])
 ; CHECK-NEXT:    [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[CMP3]] to i32
 ; CHECK-NEXT:    ret i32 [[COND]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index 2bc5469a7d88..4c2d51b188e6 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -23,7 +23,7 @@ define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], <i32 65535, i32 65535, i32 65535, i32 65535>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
 ; CHECK-NEXT:    ret i32 [[TMP10]]
 ;
   %tmp00 = lshr i32 %a, 15

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
index 5f8cbd5c8d01..e46f924852e9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -17,7 +17,7 @@ define void @PR28330(i32 %n) {
 ; DEFAULT:       for.body:
 ; DEFAULT-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
-; DEFAULT-NEXT:    [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; DEFAULT-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
 ; DEFAULT-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
 ; DEFAULT-NEXT:    br label [[FOR_BODY]]
 ;
@@ -61,7 +61,7 @@ define void @PR28330(i32 %n) {
 ; GATHER-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
 ; GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
 ; GATHER-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
-; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
+; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
 ; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP35]], [[P17]]
 ; GATHER-NEXT:    br label [[FOR_BODY]]
 ;
@@ -153,7 +153,7 @@ define void @PR32038(i32 %n) {
 ; DEFAULT:       for.body:
 ; DEFAULT-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
-; DEFAULT-NEXT:    [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; DEFAULT-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
 ; DEFAULT-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], -5
 ; DEFAULT-NEXT:    br label [[FOR_BODY]]
 ;
@@ -197,7 +197,7 @@ define void @PR32038(i32 %n) {
 ; GATHER-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
 ; GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
 ; GATHER-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
-; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
+; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
 ; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP35]], -5
 ; GATHER-NEXT:    br label [[FOR_BODY]]
 ;
@@ -229,7 +229,7 @@ define void @PR32038(i32 %n) {
 ; MAX-COST-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
 ; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
 ; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
 ; MAX-COST-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]]
 ; MAX-COST-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]]
 ; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index ab07e8c248a0..1f801834add0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -46,7 +46,7 @@ define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias noca
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
 ; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]]
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]]
@@ -169,7 +169,7 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP5]], [[S_020]]
 ; CHECK-NEXT:    [[CMP14:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP14]], label [[IF_END]], label [[FOR_END_LOOPEXIT:%.*]]
@@ -285,7 +285,7 @@ define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noali
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i32> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> [[TMP6]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
 ; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP10]], [[S_047]]
 ; CHECK-NEXT:    [[CMP83:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 6d2c2f410ade..ec634a144fcb 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -244,7 +244,7 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], <i32 65535, i32 65535, i32 65535, i32 65535>
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
 ; CHECK-NEXT:    ret i32 [[TMP15]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
index fc763b0df246..d42045460812 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
@@ -19,7 +19,7 @@ define void @mainTest(i32* %ptr) #0  {
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP10]], 1
 ; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], [[TMP7]]
 ; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], [[TMP6]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
index 3a6901b51b4f..d4a0710406d3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -20,7 +20,7 @@ define void @test() #0 {
 ; CHECK-NEXT:    [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = ashr exact <4 x i64> [[TMP8]], <i64 32, i64 32, i64 32, i64 32>
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> [[TMP9]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP9]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[OP_EXTRA1]] = add i64 [[OP_EXTRA]], [[TMP6]]
 ; CHECK-NEXT:    br label [[LOOP]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 5c692574b292..a38aa8ed4a88 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -11,7 +11,7 @@ define void @Test(i32) {
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]]
 ; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
 ; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]]
@@ -62,7 +62,7 @@ define void @Test(i32) {
 ; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
 ; FORCE_REDUCTION-NEXT:    [[VAL_20:%.*]] = add i32 [[TMP2]], 1496
 ; FORCE_REDUCTION-NEXT:    [[VAL_34:%.*]] = add i32 [[TMP2]], 8555
-; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP3]])
+; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]])
 ; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]]
 ; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]]
 ; FORCE_REDUCTION-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0:%.*]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
index a1c239c92c8e..ba499e29eef8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -13,7 +13,7 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15
 ; CHECK-NEXT:    store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[V44:%.*]] = add i32 [[TMP2]], 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
index b0971dd80450..af12041c07ec 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@@ -84,7 +84,7 @@ define float @merge_anyof_v4f32_wrong_first(<4 x float> %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
 ; CHECK-NEXT:    [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01
 ; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]]
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00
 ; CHECK-NEXT:    ret float [[R]]
@@ -111,7 +111,7 @@ define float @merge_anyof_v4f32_wrong_last(<4 x float> %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
 ; CHECK-NEXT:    [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01
 ; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]]
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00
 ; CHECK-NEXT:    ret float [[R]]
@@ -138,7 +138,7 @@ define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
 ; CHECK-NEXT:    [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP1]], 42
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]]
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[TMP4]], i32 -1, i32 1
 ; CHECK-NEXT:    ret i32 [[R]]
@@ -169,7 +169,7 @@ define i32 @merge_anyof_v4i32_wrong_middle_better_rdx(<4 x i32> %x, <4 x i32> %y
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
 ; CHECK-NEXT:    [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP4]], [[CMP3WRONG]]
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
 ; CHECK-NEXT:    ret i32 [[R]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index dd5d649c41bb..18d9fae436ae 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -102,7 +102,7 @@ define float @bazz() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
 ; CHECK-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
 ; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
 ; CHECK-NEXT:    store float [[OP_EXTRA1]], float* @res, align 4
@@ -118,7 +118,7 @@ define float @bazz() {
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
 ; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
 ; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
 ; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
 ; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
 ; THRESHOLD-NEXT:    store float [[OP_EXTRA1]], float* @res, align 4
@@ -175,7 +175,7 @@ define float @bazzz() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
 ; CHECK-NEXT:    store float [[TMP5]], float* @res, align 4
 ; CHECK-NEXT:    ret float [[TMP5]]
@@ -187,7 +187,7 @@ define float @bazzz() {
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; THRESHOLD-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
 ; THRESHOLD-NEXT:    store float [[TMP5]], float* @res, align 4
 ; THRESHOLD-NEXT:    ret float [[TMP5]]
@@ -223,7 +223,7 @@ define i32 @foo() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
 ; CHECK-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
 ; CHECK-NEXT:    store i32 [[CONV4]], i32* @n, align 4
@@ -236,7 +236,7 @@ define i32 @foo() {
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; THRESHOLD-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
 ; THRESHOLD-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
 ; THRESHOLD-NEXT:    store i32 [[CONV4]], i32* @n, align 4
@@ -390,8 +390,8 @@ define float @f(float* nocapture readonly %x) {
 ; CHECK-NEXT:    [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret float [[OP_RDX]]
 ;
@@ -448,8 +448,8 @@ define float @f(float* nocapture readonly %x) {
 ; THRESHOLD-NEXT:    [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP3]])
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP3]])
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
 ; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 ; THRESHOLD-NEXT:    ret float [[OP_RDX]]
 ;
@@ -637,7 +637,7 @@ define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP1]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
 ; CHECK-NEXT:    ret float [[OP_EXTRA]]
 ;
@@ -678,7 +678,7 @@ define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; THRESHOLD-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP1]])
 ; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
 ; THRESHOLD-NEXT:    ret float [[OP_EXTRA]]
 ;
@@ -824,10 +824,10 @@ define float @loadadd31(float* nocapture readonly %x) {
 ; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP7]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP5]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
@@ -873,10 +873,10 @@ define float @loadadd31(float* nocapture readonly %x) {
 ; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
 ; THRESHOLD-NEXT:    [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
 ; THRESHOLD-NEXT:    [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP7]])
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP5]])
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP7]])
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP5]])
 ; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
 ; THRESHOLD-NEXT:    [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]]
 ; THRESHOLD-NEXT:    [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
@@ -990,7 +990,7 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
 ; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
 ; CHECK-NEXT:    ret float [[OP_EXTRA1]]
@@ -1009,7 +1009,7 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
 ; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
 ; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
 ; THRESHOLD-NEXT:    ret float [[OP_EXTRA1]]
@@ -1060,7 +1060,7 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
 ; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
 ; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00
@@ -1081,7 +1081,7 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
 ; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
 ; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
 ; THRESHOLD-NEXT:    [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00
@@ -1138,7 +1138,7 @@ define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
 ; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
 ; CHECK-NEXT:    ret float [[OP_EXTRA1]]
@@ -1159,7 +1159,7 @@ define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
 ; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
 ; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
 ; THRESHOLD-NEXT:    ret float [[OP_EXTRA1]]
@@ -1212,7 +1212,7 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
 ; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
 ; CHECK-NEXT:    ret i32 [[OP_EXTRA1]]
@@ -1231,7 +1231,7 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
 ; THRESHOLD-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
 ; THRESHOLD-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
-; THRESHOLD-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
+; THRESHOLD-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
 ; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
 ; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
 ; THRESHOLD-NEXT:    ret i32 [[OP_EXTRA1]]

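For the floating-point reductions updated in horizontal-list.ll, the rename also drops the extra mangling of the old "v2" names: the new names are overloaded on the vector type only, and the explicit start-value operand is kept. A minimal sketch with a placeholder operand %x:

    ; old spelling
    %s = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> %x)
    ; new spelling
    %s = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> %x)
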
diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 9663ede723cc..58056a592c3b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -12,7 +12,7 @@
 define i32 @maxi8(i32) {
 ; CHECK-LABEL: @maxi8(
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
 ; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -43,7 +43,7 @@ define i32 @maxi8(i32) {
 define i32 @maxi16(i32) {
 ; CHECK-LABEL: @maxi16(
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])
 ; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -98,7 +98,7 @@ define i32 @maxi16(i32) {
 define i32 @maxi32(i32) {
 ; CHECK-LABEL: @maxi32(
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> [[TMP2]])
 ; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -758,7 +758,7 @@ define i32 @maxi8_mutiple_uses(i32) {
 ; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
 ; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; AVX-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
 ; AVX-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]]
 ; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
 ; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]]
@@ -776,7 +776,7 @@ define i32 @maxi8_mutiple_uses(i32) {
 ; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
 ; THRESH-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
 ; THRESH-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; THRESH-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
+; THRESH-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
 ; THRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[TMP7]], i32 0
 ; THRESH-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP3]], i32 1
 ; THRESH-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> undef, i32 [[TMP6]], i32 0
@@ -860,7 +860,7 @@ define i32 @maxi8_wrong_parent(i32) {
 ; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
 ; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; AVX-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
 ; AVX-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
 ; AVX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
 ; AVX-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
@@ -879,7 +879,7 @@ define i32 @maxi8_wrong_parent(i32) {
 ; THRESH-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
 ; THRESH-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
 ; THRESH-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; THRESH-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; THRESH-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
 ; THRESH-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
 ; THRESH-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
 ; THRESH-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
index 703ebbf9dd73..e8e21d4ff732 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -37,7 +37,7 @@ define i32 @add_red(float* %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; CHECK-NEXT:    [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
 ; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_033]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
@@ -70,7 +70,7 @@ define i32 @add_red(float* %A, i32 %n) {
 ; STORE-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
 ; STORE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
 ; STORE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
-; STORE-NEXT:    [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; STORE-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; STORE-NEXT:    [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
 ; STORE-NEXT:    [[INC]] = add nsw i64 [[I_033]], 1
 ; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
@@ -164,7 +164,7 @@ define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
 ; CHECK-NEXT:    [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
 ; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_040]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
@@ -202,7 +202,7 @@ define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
 ; STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
 ; STORE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; STORE-NEXT:    [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
-; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
+; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
 ; STORE-NEXT:    [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
 ; STORE-NEXT:    [[INC]] = add nsw i64 [[I_040]], 1
 ; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
@@ -326,7 +326,7 @@ define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
 ; CHECK-NEXT:    [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP6]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
 ; CHECK-NEXT:    [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]]
 ; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_083]], 1
@@ -383,7 +383,7 @@ define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
 ; STORE-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
 ; STORE-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
 ; STORE-NEXT:    [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
-; STORE-NEXT:    [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP6]])
+; STORE-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP6]])
 ; STORE-NEXT:    [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
 ; STORE-NEXT:    [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]]
 ; STORE-NEXT:    [[INC]] = add nsw i64 [[I_083]], 1
@@ -520,7 +520,7 @@ define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
 ; CHECK-NEXT:    [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]]
 ; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_043]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
@@ -558,7 +558,7 @@ define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
 ; STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
 ; STORE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
-; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
+; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
 ; STORE-NEXT:    [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]]
 ; STORE-NEXT:    [[INC]] = add nsw i64 [[I_043]], 1
 ; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
@@ -1015,7 +1015,7 @@ define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i
 ; STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
 ; STORE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]]
-; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
+; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
 ; STORE-NEXT:    store float [[TMP6]], float* [[C_ADDR_038]], align 4
 ; STORE-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
 ; STORE-NEXT:    [[INC]] = add nsw i64 [[I_039]], 1
@@ -1090,7 +1090,7 @@ define void @float_red_example4(float* %res) {
 ; STORE-LABEL: @float_red_example4(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16
-; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
+; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
 ; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
 ; STORE-NEXT:    ret void
 ;
@@ -1130,7 +1130,7 @@ define void @float_red_example8(float* %res) {
 ; STORE-LABEL: @float_red_example8(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16
-; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
+; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
 ; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
 ; STORE-NEXT:    ret void
 ;
@@ -1194,7 +1194,7 @@ define void @float_red_example16(float* %res) {
 ; STORE-LABEL: @float_red_example16(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16
-; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
+; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
 ; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
 ; STORE-NEXT:    ret void
 ;
@@ -1250,7 +1250,7 @@ define void @i32_red_example4(i32* %res) {
 ; STORE-LABEL: @i32_red_example4(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
-; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
+; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
 ; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
 ; STORE-NEXT:    ret void
 ;
@@ -1290,7 +1290,7 @@ define void @i32_red_example8(i32* %res) {
 ; STORE-LABEL: @i32_red_example8(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
-; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
+; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
 ; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
 ; STORE-NEXT:    ret void
 ;
@@ -1354,7 +1354,7 @@ define void @i32_red_example16(i32* %res) {
 ; STORE-LABEL: @i32_red_example16(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
-; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
+; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
 ; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
 ; STORE-NEXT:    ret void
 ;
@@ -1466,7 +1466,7 @@ define void @i32_red_example32(i32* %res) {
 ; STORE-LABEL: @i32_red_example32(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
-; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
+; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
 ; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
 ; STORE-NEXT:    ret void
 ;
@@ -1544,14 +1544,14 @@ define void @i32_red_call(i32 %val) {
 ; CHECK-LABEL: @i32_red_call(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
 ; CHECK-NEXT:    [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
 ; CHECK-NEXT:    ret void
 ;
 ; STORE-LABEL: @i32_red_call(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
-; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
+; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
 ; STORE-NEXT:    [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
 ; STORE-NEXT:    ret void
 ;
@@ -1579,7 +1579,7 @@ define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_
 ; CHECK-LABEL: @i32_red_invoke(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
 ; CHECK-NEXT:    [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
 ; CHECK-NEXT:    to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
 ; CHECK:       exception:
@@ -1592,7 +1592,7 @@ define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_
 ; STORE-LABEL: @i32_red_invoke(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
-; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
+; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
 ; STORE-NEXT:    [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
 ; STORE-NEXT:    to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
 ; STORE:       exception:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
index 0879e358ad73..470a7d44c03f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
@@ -5,7 +5,7 @@ define signext i8 @Foo(<32 x i8>* %__v) {
 ; CHECK-LABEL: @Foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, <32 x i8>* [[__V:%.*]], align 32
-; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> [[TMP0]])
 ; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll
index 30ceee8f9e81..bcca68fb38a1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll
@@ -80,7 +80,7 @@ define i32 @horiz_max_multiple_uses([32 x i32]* %x, i32* %p) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[T4:%.*]] = load i32, i32* [[X4]], align 4
 ; CHECK-NEXT:    [[T5:%.*]] = load i32, i32* [[X5]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[T4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[T4]]
 ; CHECK-NEXT:    [[C012345:%.*]] = icmp sgt i32 [[TMP5]], [[T5]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
index d0bb09a9e24a..12d2355561b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -35,7 +35,7 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
@@ -124,7 +124,7 @@ define i32 @test2(i32* nocapture readonly %p, i32* nocapture readonly %q) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
@@ -230,7 +230,7 @@ define i32 @test3(i32* nocapture readonly %p, i32* nocapture readonly %q) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[REORDER_SHUFFLE]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
index 0d5050412f28..6322551facc1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -26,7 +26,7 @@ define i32 @test_add(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
@@ -74,7 +74,7 @@ define i32 @test_mul(i32* nocapture readonly %p) {
 ; AVX-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; AVX-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; AVX-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]])
 ; AVX-NEXT:    ret i32 [[TMP2]]
 ;
 ; SSE-LABEL: @test_mul(
@@ -148,7 +148,7 @@ define i32 @test_and(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
@@ -196,7 +196,7 @@ define i32 @test_or(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
@@ -244,7 +244,7 @@ define i32 @test_xor(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
@@ -284,7 +284,7 @@ define i32 @PR37731(<4 x i32>* noalias nocapture dereferenceable(16) %self) unna
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], <i32 18, i32 2, i32 7, i32 13>
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP7]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
index 518ec13a36fd..149303c4bdc4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -38,7 +38,7 @@ define i32 @foo(i32* %diff) #0 {
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
 ; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP15]], [[A_088]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
index 9ed21a1c3f8c..4cbc06ae6035 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -18,14 +18,14 @@ define void @hoge() {
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef
 ; CHECK-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]], <i32 15, i32 31, i32 47, i32 undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef
 ; CHECK-NEXT:    [[T20:%.*]] = icmp sgt i32 [[T19]], 63
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -49, i32 -33, i32 -33, i32 -17>
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
index 7898d4112890..e2659dd121fa 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
@@ -5,7 +5,7 @@ define float @dotf(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: @dotf(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
 ; CHECK-NEXT:    ret float [[TMP1]]
 ;
 entry:
@@ -33,7 +33,7 @@ define double @dotd(<4 x double>* byval nocapture readonly align 32, <4 x double
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32
 ; CHECK-NEXT:    [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
 ; CHECK-NEXT:    ret double [[TMP3]]
 ;
 entry:
@@ -63,7 +63,7 @@ define float @dotfq(<4 x float>* nocapture readonly %x, <4 x float>* nocapture r
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
 ; CHECK-NEXT:    ret float [[TMP3]]
 ;
 entry:
@@ -93,7 +93,7 @@ define double @dotdq(<4 x double>* nocapture readonly %x, <4 x double>* nocaptur
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
 ; CHECK-NEXT:    ret double [[TMP3]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
index d3bf7f1dd498..875fcbc52e6a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -37,7 +37,7 @@ define i32 @foo(i32* nocapture readonly %diff) #0 {
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
 ; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP15]], [[A_088]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
index 343f98957a5c..3b5a53e44dec 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
@@ -16,7 +16,7 @@ define void @_Z2azv() local_unnamed_addr {
 ; CHECK-NEXT:    [[DOTSROA_RAW_IDX_7:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DOTSROA_CAST_4]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], undef
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 undef
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[OP_EXTRA]], undef

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
index 5818bd29b4c6..a7a6e1005e5c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
@@ -55,7 +55,7 @@ define void @n() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[TMP38:%.*]] = icmp slt <4 x i32> [[TMP37]], zeroinitializer
 ; CHECK-NEXT:    [[TMP39:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP37]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = select <4 x i1> [[TMP38]], <4 x i32> [[TMP39]], <4 x i32> [[TMP37]]
-; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP40]])
+; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP40]])
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp slt i32 [[TMP41]], [[TMP32]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP41]], i32 [[TMP32]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = icmp slt i32 [[TMP43]], [[B_0]]

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
index 02e7c5b37f3e..31b73236f5e7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -17,7 +17,7 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
 entry:
@@ -67,7 +67,7 @@ define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
 entry:
@@ -121,7 +121,7 @@ define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
 entry:

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
index 3c1319a5c625..9e9237e82b53 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -291,14 +291,14 @@ class LLVM_OneResultIntrOp<string mnem, list<int> overloadedResults = [],
 
 // LLVM vector reduction over a single vector.
 class LLVM_VectorReduction<string mnem>
-    : LLVM_OneResultIntrOp<"experimental.vector.reduce." # mnem,
+    : LLVM_OneResultIntrOp<"vector.reduce." # mnem,
                            [], [0], [NoSideEffect]>,
       Arguments<(ins LLVM_Type)>;
 
 // LLVM vector reduction over a single vector, with an initial value,
 // and with permission to reassociate the reduction operations.
-class LLVM_VectorReductionV2<string mnem>
-    : LLVM_OpBase<LLVM_Dialect, "intr.experimental.vector.reduce.v2." # mnem,
+class LLVM_VectorReductionAcc<string mnem>
+    : LLVM_OpBase<LLVM_Dialect, "intr.vector.reduce." # mnem,
                   [NoSideEffect]>,
       Results<(outs LLVM_Type:$res)>,
       Arguments<(ins LLVM_Type, LLVM_Type,
@@ -307,7 +307,7 @@ class LLVM_VectorReductionV2<string mnem>
     llvm::Module *module = builder.GetInsertBlock()->getModule();
     llvm::Function *fn = llvm::Intrinsic::getDeclaration(
         module,
-        llvm::Intrinsic::experimental_vector_reduce_v2_}] # mnem # [{,
+        llvm::Intrinsic::vector_reduce_}] # mnem # [{,
         { }] # StrJoin<!listconcat(
             ListIntSubst<LLVM_IntrPatterns.result, [0]>.lst,
             ListIntSubst<LLVM_IntrPatterns.operand, [1]>.lst)>.result # [{

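For reference, a minimal sketch (not part of this commit; the function name is illustrative) of what an op built from the renamed LLVM_VectorReductionAcc class looks like in MLIR generic form, using the same accumulator-plus-vector signature and optional reassoc attribute exercised by the integration tests further down:

    llvm.func @fadd_reduce(%acc: !llvm.float, %v: !llvm.vec<4 x float>) -> !llvm.float {
      // Accumulator first, vector second; reassoc = true permits reassociating
      // the reduction operations.
      %r = "llvm.intr.vector.reduce.fadd"(%acc, %v) {reassoc = true}
          : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
      llvm.return %r : !llvm.float
    }
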
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 94f6809349f1..b4554cff7e62 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -885,20 +885,20 @@ def LLVM_MemcpyInlineOp : LLVM_ZeroResultIntrOp<"memcpy.inline", [0, 1, 2]>,
 // Vector Reductions.
 //
 
-def LLVM_experimental_vector_reduce_add : LLVM_VectorReduction<"add">;
-def LLVM_experimental_vector_reduce_and : LLVM_VectorReduction<"and">;
-def LLVM_experimental_vector_reduce_mul : LLVM_VectorReduction<"mul">;
-def LLVM_experimental_vector_reduce_fmax : LLVM_VectorReduction<"fmax">;
-def LLVM_experimental_vector_reduce_fmin : LLVM_VectorReduction<"fmin">;
-def LLVM_experimental_vector_reduce_or : LLVM_VectorReduction<"or">;
-def LLVM_experimental_vector_reduce_smax : LLVM_VectorReduction<"smax">;
-def LLVM_experimental_vector_reduce_smin : LLVM_VectorReduction<"smin">;
-def LLVM_experimental_vector_reduce_umax : LLVM_VectorReduction<"umax">;
-def LLVM_experimental_vector_reduce_umin : LLVM_VectorReduction<"umin">;
-def LLVM_experimental_vector_reduce_xor : LLVM_VectorReduction<"xor">;
-
-def LLVM_experimental_vector_reduce_v2_fadd : LLVM_VectorReductionV2<"fadd">;
-def LLVM_experimental_vector_reduce_v2_fmul : LLVM_VectorReductionV2<"fmul">;
+def LLVM_vector_reduce_add : LLVM_VectorReduction<"add">;
+def LLVM_vector_reduce_and : LLVM_VectorReduction<"and">;
+def LLVM_vector_reduce_mul : LLVM_VectorReduction<"mul">;
+def LLVM_vector_reduce_fmax : LLVM_VectorReduction<"fmax">;
+def LLVM_vector_reduce_fmin : LLVM_VectorReduction<"fmin">;
+def LLVM_vector_reduce_or : LLVM_VectorReduction<"or">;
+def LLVM_vector_reduce_smax : LLVM_VectorReduction<"smax">;
+def LLVM_vector_reduce_smin : LLVM_VectorReduction<"smin">;
+def LLVM_vector_reduce_umax : LLVM_VectorReduction<"umax">;
+def LLVM_vector_reduce_umin : LLVM_VectorReduction<"umin">;
+def LLVM_vector_reduce_xor : LLVM_VectorReduction<"xor">;
+
+def LLVM_vector_reduce_fadd : LLVM_VectorReductionAcc<"fadd">;
+def LLVM_vector_reduce_fmul : LLVM_VectorReductionAcc<"fmul">;
 
 //
 // LLVM Matrix operations.

diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td
index 137e130c4594..f8fc4dead86f 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td
@@ -207,7 +207,7 @@ def Vector_ReductionOp :
     Note that these operations are restricted to 1-D vectors to remain
     close to the corresponding LLVM intrinsics:
 
-    http://llvm.org/docs/LangRef.html#experimental-vector-reduction-intrinsics
+    http://llvm.org/docs/LangRef.html#vector-reduction-intrinsics
 
     Example:
 

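As a concrete illustration of the 1-D restriction mentioned in that description, a small sketch (assumed Vector dialect syntax at this revision, not taken from the diff):

    func @red_i32(%v : vector<16xi32>) -> i32 {
      // Only 1-D vectors are accepted; the reduction kind is a string attribute.
      %0 = vector.reduction "add", %v : vector<16xi32> into i32
      return %0 : i32
    }
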
diff --git a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
index eef0c9d51185..dbbcecfa3b61 100644
--- a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
+++ b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
@@ -24,61 +24,61 @@ module {
     %12 = llvm.mlir.constant(3 : i64) : !llvm.i64
     %v = llvm.insertelement %3, %11[%12 : !llvm.i64] : !llvm.vec<4 x float>
 
-    %max = "llvm.intr.experimental.vector.reduce.fmax"(%v)
+    %max = "llvm.intr.vector.reduce.fmax"(%v)
         : (!llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%max) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 4
 
-    %min = "llvm.intr.experimental.vector.reduce.fmin"(%v)
+    %min = "llvm.intr.vector.reduce.fmin"(%v)
         : (!llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%min) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 1
 
-    %add1 = "llvm.intr.experimental.vector.reduce.v2.fadd"(%0, %v)
+    %add1 = "llvm.intr.vector.reduce.fadd"(%0, %v)
         : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add1) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 11
 
-    %add1r = "llvm.intr.experimental.vector.reduce.v2.fadd"(%0, %v)
+    %add1r = "llvm.intr.vector.reduce.fadd"(%0, %v)
         {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add1r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 11
 
-    %add2 = "llvm.intr.experimental.vector.reduce.v2.fadd"(%1, %v)
+    %add2 = "llvm.intr.vector.reduce.fadd"(%1, %v)
         : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add2) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 12
 
-    %add2r = "llvm.intr.experimental.vector.reduce.v2.fadd"(%1, %v)
+    %add2r = "llvm.intr.vector.reduce.fadd"(%1, %v)
         {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add2r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 12
 
-    %mul1 = "llvm.intr.experimental.vector.reduce.v2.fmul"(%0, %v)
+    %mul1 = "llvm.intr.vector.reduce.fmul"(%0, %v)
         : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul1) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 24
 
-    %mul1r = "llvm.intr.experimental.vector.reduce.v2.fmul"(%0, %v)
+    %mul1r = "llvm.intr.vector.reduce.fmul"(%0, %v)
         {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul1r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 24
 
-    %mul2 = "llvm.intr.experimental.vector.reduce.v2.fmul"(%1, %v)
+    %mul2 = "llvm.intr.vector.reduce.fmul"(%1, %v)
         : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul2) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 48
 
-    %mul2r = "llvm.intr.experimental.vector.reduce.v2.fmul"(%1, %v)
+    %mul2r = "llvm.intr.vector.reduce.fmul"(%1, %v)
         {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul2r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()

diff --git a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
index e4d578e36e23..7d8c9736a348 100644
--- a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
+++ b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
@@ -24,55 +24,55 @@ module {
     %12 = llvm.mlir.constant(3 : i64) : !llvm.i64
     %v = llvm.insertelement %3, %11[%12 : !llvm.i64] : !llvm.vec<4 x i64>
 
-    %add = "llvm.intr.experimental.vector.reduce.add"(%v)
+    %add = "llvm.intr.vector.reduce.add"(%v)
         : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%add) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 10
 
-    %and = "llvm.intr.experimental.vector.reduce.and"(%v)
+    %and = "llvm.intr.vector.reduce.and"(%v)
         : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%and) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 0
 
-    %mul = "llvm.intr.experimental.vector.reduce.mul"(%v)
+    %mul = "llvm.intr.vector.reduce.mul"(%v)
         : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%mul) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 24
 
-    %or = "llvm.intr.experimental.vector.reduce.or"(%v)
+    %or = "llvm.intr.vector.reduce.or"(%v)
         : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%or) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 7
 
-    %smax = "llvm.intr.experimental.vector.reduce.smax"(%v)
+    %smax = "llvm.intr.vector.reduce.smax"(%v)
         : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%smax) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 4
 
-    %smin = "llvm.intr.experimental.vector.reduce.smin"(%v)
+    %smin = "llvm.intr.vector.reduce.smin"(%v)
         : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%smin) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 1
 
-    %umax = "llvm.intr.experimental.vector.reduce.umax"(%v)
+    %umax = "llvm.intr.vector.reduce.umax"(%v)
         : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%umax) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 4
 
-    %umin = "llvm.intr.experimental.vector.reduce.umin"(%v)
+    %umin = "llvm.intr.vector.reduce.umin"(%v)
         : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%umin) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 1
 
-    %xor = "llvm.intr.experimental.vector.reduce.xor"(%v)
+    %xor = "llvm.intr.vector.reduce.xor"(%v)
         : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%xor) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()

diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index e6c0feb070f0..ac94a421903d 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -564,33 +564,33 @@ class VectorReductionOpConversion : public ConvertToLLVMPattern {
     if (eltType.isIntOrIndex()) {
       // Integer reductions: add/mul/min/max/and/or/xor.
       if (kind == "add")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_add>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_add>(
             op, llvmType, operands[0]);
       else if (kind == "mul")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_mul>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_mul>(
             op, llvmType, operands[0]);
       else if (kind == "min" &&
                (eltType.isIndex() || eltType.isUnsignedInteger()))
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_umin>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_umin>(
             op, llvmType, operands[0]);
       else if (kind == "min")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_smin>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_smin>(
             op, llvmType, operands[0]);
       else if (kind == "max" &&
                (eltType.isIndex() || eltType.isUnsignedInteger()))
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_umax>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_umax>(
             op, llvmType, operands[0]);
       else if (kind == "max")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_smax>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_smax>(
             op, llvmType, operands[0]);
       else if (kind == "and")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_and>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_and>(
             op, llvmType, operands[0]);
       else if (kind == "or")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_or>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_or>(
             op, llvmType, operands[0]);
       else if (kind == "xor")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_xor>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_xor>(
             op, llvmType, operands[0]);
       else
         return failure();
@@ -604,7 +604,7 @@ class VectorReductionOpConversion : public ConvertToLLVMPattern {
                                         : rewriter.create<LLVM::ConstantOp>(
                                               op->getLoc(), llvmType,
                                               rewriter.getZeroAttr(eltType));
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_v2_fadd>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_fadd>(
             op, llvmType, acc, operands[0],
             rewriter.getBoolAttr(reassociateFPReductions));
       } else if (kind == "mul") {
@@ -614,14 +614,14 @@ class VectorReductionOpConversion : public ConvertToLLVMPattern {
                         : rewriter.create<LLVM::ConstantOp>(
                               op->getLoc(), llvmType,
                               rewriter.getFloatAttr(eltType, 1.0));
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_v2_fmul>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_fmul>(
             op, llvmType, acc, operands[0],
             rewriter.getBoolAttr(reassociateFPReductions));
       } else if (kind == "min")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_fmin>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_fmin>(
             op, llvmType, operands[0]);
       else if (kind == "max")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_fmax>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_fmax>(
             op, llvmType, operands[0]);
       else
         return failure();

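Putting the updated pattern together, a sketch of the expected rewrite for the floating-point "add" case (mirroring the CHECK lines in the conversion test below; the SSA names are illustrative):

    // Input, Vector dialect:
    //   %0 = vector.reduction "add", %arg0 : vector<16xf32> into f32
    // After the vector-to-LLVM conversion (reassociation off by default), the
    // zero constant becomes the accumulator of the renamed intrinsic op:
    //   %c = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float
    //   %0 = "llvm.intr.vector.reduce.fadd"(%c, %arg0)
    //       {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
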
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir
index a2a47b0eff26..70b0a8fadfee 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir
@@ -5,14 +5,14 @@
 // CHECK-LABEL: llvm.func @reduce_add_f32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 //      CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float
-//      CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+//      CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 //      CHECK: llvm.return %[[V]] : !llvm.float
 //
 // REASSOC-LABEL: llvm.func @reduce_add_f32(
 // REASSOC-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 //      REASSOC: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float
-//      REASSOC: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+//      REASSOC: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // REASSOC-SAME: {reassoc = true} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 //      REASSOC: llvm.return %[[V]] : !llvm.float
 //
@@ -25,14 +25,14 @@ func @reduce_add_f32(%arg0: vector<16xf32>) -> f32 {
 // CHECK-LABEL: llvm.func @reduce_mul_f32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 //      CHECK: %[[C:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : !llvm.float
-//      CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fmul"(%[[C]], %[[A]])
+//      CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fmul"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 //      CHECK: llvm.return %[[V]] : !llvm.float
 //
 // REASSOC-LABEL: llvm.func @reduce_mul_f32(
 // REASSOC-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 //      REASSOC: %[[C:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : !llvm.float
-//      REASSOC: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fmul"(%[[C]], %[[A]])
+//      REASSOC: %[[V:.*]] = "llvm.intr.vector.reduce.fmul"(%[[C]], %[[A]])
 // REASSOC-SAME: {reassoc = true} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 //      REASSOC: llvm.return %[[V]] : !llvm.float
 //

diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index eae918897d32..ab4948a56791 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -774,7 +774,7 @@ func @reduce_f16(%arg0: vector<16xf16>) -> f16 {
 // CHECK-LABEL: llvm.func @reduce_f16(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x half>)
 //      CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f16) : !llvm.half
-//      CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+//      CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.half, !llvm.vec<16 x half>) -> !llvm.half
 //      CHECK: llvm.return %[[V]] : !llvm.half
 
@@ -785,7 +785,7 @@ func @reduce_f32(%arg0: vector<16xf32>) -> f32 {
 // CHECK-LABEL: llvm.func @reduce_f32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 //      CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float
-//      CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+//      CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 //      CHECK: llvm.return %[[V]] : !llvm.float
 
@@ -796,7 +796,7 @@ func @reduce_f64(%arg0: vector<16xf64>) -> f64 {
 // CHECK-LABEL: llvm.func @reduce_f64(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x double>)
 //      CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f64) : !llvm.double
-//      CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+//      CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.double, !llvm.vec<16 x double>) -> !llvm.double
 //      CHECK: llvm.return %[[V]] : !llvm.double
 
@@ -806,7 +806,7 @@ func @reduce_i8(%arg0: vector<16xi8>) -> i8 {
 }
 // CHECK-LABEL: llvm.func @reduce_i8(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x i8>)
-//      CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.add"(%[[A]])
+//      CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.add"(%[[A]])
 //      CHECK: llvm.return %[[V]] : !llvm.i8
 
 func @reduce_i32(%arg0: vector<16xi32>) -> i32 {
@@ -815,7 +815,7 @@ func @reduce_i32(%arg0: vector<16xi32>) -> i32 {
 }
 // CHECK-LABEL: llvm.func @reduce_i32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x i32>)
-//      CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.add"(%[[A]])
+//      CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.add"(%[[A]])
 //      CHECK: llvm.return %[[V]] : !llvm.i32
 
 func @reduce_i64(%arg0: vector<16xi64>) -> i64 {
@@ -824,7 +824,7 @@ func @reduce_i64(%arg0: vector<16xi64>) -> i64 {
 }
 // CHECK-LABEL: llvm.func @reduce_i64(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x i64>)
-//      CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.add"(%[[A]])
+//      CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.add"(%[[A]])
 //      CHECK: llvm.return %[[V]] : !llvm.i64
 
 

diff --git a/mlir/test/Target/llvmir-intrinsics.mlir b/mlir/test/Target/llvmir-intrinsics.mlir
index 8a598e67d17b..7ab440a7e5ca 100644
--- a/mlir/test/Target/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/llvmir-intrinsics.mlir
@@ -182,36 +182,36 @@ llvm.func @smin_test(%arg0: !llvm.i32, %arg1: !llvm.i32, %arg2: !llvm.vec<8 x i3
 
 // CHECK-LABEL: @vector_reductions
 llvm.func @vector_reductions(%arg0: !llvm.float, %arg1: !llvm.vec<8 x float>, %arg2: !llvm.vec<8 x i32>) {
-  // CHECK: call i32 @llvm.experimental.vector.reduce.add.v8i32
-  "llvm.intr.experimental.vector.reduce.add"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.and.v8i32
-  "llvm.intr.experimental.vector.reduce.and"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call float @llvm.experimental.vector.reduce.fmax.v8f32
-  "llvm.intr.experimental.vector.reduce.fmax"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call float @llvm.experimental.vector.reduce.fmin.v8f32
-  "llvm.intr.experimental.vector.reduce.fmin"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call i32 @llvm.experimental.vector.reduce.mul.v8i32
-  "llvm.intr.experimental.vector.reduce.mul"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.or.v8i32
-  "llvm.intr.experimental.vector.reduce.or"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.smax.v8i32
-  "llvm.intr.experimental.vector.reduce.smax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.smin.v8i32
-  "llvm.intr.experimental.vector.reduce.smin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.umax.v8i32
-  "llvm.intr.experimental.vector.reduce.umax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.umin.v8i32
-  "llvm.intr.experimental.vector.reduce.umin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32
-  "llvm.intr.experimental.vector.reduce.v2.fadd"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32
-  "llvm.intr.experimental.vector.reduce.v2.fmul"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32
-  "llvm.intr.experimental.vector.reduce.v2.fadd"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32
-  "llvm.intr.experimental.vector.reduce.v2.fmul"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call i32 @llvm.experimental.vector.reduce.xor.v8i32
-  "llvm.intr.experimental.vector.reduce.xor"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.add.v8i32
+  "llvm.intr.vector.reduce.add"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.and.v8i32
+  "llvm.intr.vector.reduce.and"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call float @llvm.vector.reduce.fmax.v8f32
+  "llvm.intr.vector.reduce.fmax"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call float @llvm.vector.reduce.fmin.v8f32
+  "llvm.intr.vector.reduce.fmin"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call i32 @llvm.vector.reduce.mul.v8i32
+  "llvm.intr.vector.reduce.mul"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.or.v8i32
+  "llvm.intr.vector.reduce.or"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.smax.v8i32
+  "llvm.intr.vector.reduce.smax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.smin.v8i32
+  "llvm.intr.vector.reduce.smin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.umax.v8i32
+  "llvm.intr.vector.reduce.umax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.umin.v8i32
+  "llvm.intr.vector.reduce.umin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call float @llvm.vector.reduce.fadd.f32.v8f32
+  "llvm.intr.vector.reduce.fadd"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call float @llvm.vector.reduce.fmul.f32.v8f32
+  "llvm.intr.vector.reduce.fmul"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call reassoc float @llvm.vector.reduce.fadd.f32.v8f32
+  "llvm.intr.vector.reduce.fadd"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call reassoc float @llvm.vector.reduce.fmul.f32.v8f32
+  "llvm.intr.vector.reduce.fmul"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call i32 @llvm.vector.reduce.xor.v8i32
+  "llvm.intr.vector.reduce.xor"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
   llvm.return
 }
 
