[llvm] bfa9ce1 - [PowerPC] Improve handling of some BUILD_VECTOR nodes

Nemanja Ivanovic via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 23 15:43:44 PDT 2020


Author: Nemanja Ivanovic
Date: 2020-03-23T17:34:29-05:00
New Revision: bfa9ce1cb27a6abac071c0b8fab76d647098eaeb

URL: https://github.com/llvm/llvm-project/commit/bfa9ce1cb27a6abac071c0b8fab76d647098eaeb
DIFF: https://github.com/llvm/llvm-project/commit/bfa9ce1cb27a6abac071c0b8fab76d647098eaeb.diff

LOG: [PowerPC] Improve handling of some BUILD_VECTOR nodes

An analysis of real-world code turned up a number of patterns in which a
BUILD_VECTOR is formed from the results of operations on extracted vector
elements, and for which we produce poor code. This patch addresses those cases.
No attempt is made at completeness, as that would entail a large amount of work
for something that there is no evidence of in real code.

Differential revision: https://reviews.llvm.org/D72660

Added: 
    

Modified: 
    llvm/lib/Target/PowerPC/PPCInstrVSX.td
    llvm/test/CodeGen/PowerPC/build-vector-tests.ll
    llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll
    llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll
    llvm/test/CodeGen/PowerPC/vsx.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index b12096dacdd3..73529533c26b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1341,6 +1341,21 @@ def DWToSPExtractConv {
   dag BVS = (v4f32 (build_vector El0SS1, El1SS1, El0SS2, El1SS2));
 }
 
+def WToDPExtractConv {
+  dag El0S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 0))));
+  dag El1S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 1))));
+  dag El2S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 2))));
+  dag El3S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 3))));
+  dag El0U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 0))));
+  dag El1U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 1))));
+  dag El2U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 2))));
+  dag El3U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 3))));
+  dag BV02S = (v2f64 (build_vector El0S, El2S));
+  dag BV13S = (v2f64 (build_vector El1S, El3S));
+  dag BV02U = (v2f64 (build_vector El0U, El2U));
+  dag BV13U = (v2f64 (build_vector El1U, El3U));
+}
+
 // The following VSX instructions were introduced in Power ISA 2.07
 /* FIXME: if the operands are v2i64, these patterns will not match.
    we should define new patterns or otherwise match the same patterns
@@ -4171,6 +4186,41 @@ let AddedComplexity = 400 in {
     def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
                                    ExtDbl.B0U, ExtDbl.B1U)),
               (v4i32 (VMRGEW MrgWords.CVA0B0U, MrgWords.CVA1B1U))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 1))))),
+              (v2f64 (XVCVSPDP (XXMRGHW $A, $A)))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 0))))),
+              (v2f64 (XXPERMDI (XVCVSPDP (XXMRGHW $A, $A)),
+                               (XVCVSPDP (XXMRGHW $A, $A)), 2))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 2))))),
+              (v2f64 (XVCVSPDP $A))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 3))))),
+              (v2f64 (XVCVSPDP (XXSLDWI $A, $A, 1)))>; // words 0,2 = A[1],A[3]
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 2))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 3))))),
+              (v2f64 (XVCVSPDP (XXMRGLW $A, $A)))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 2))))),
+              (v2f64 (XXPERMDI (XVCVSPDP (XXMRGLW $A, $A)),
+                               (XVCVSPDP (XXMRGLW $A, $A)), 2))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+                                   (f64 (fpextend (extractelt v4f32:$B, 0))))),
+              (v2f64 (XVCVSPDP (XXPERMDI $A, $B, 0)))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))),
+                                   (f64 (fpextend (extractelt v4f32:$B, 3))))),
+              (v2f64 (XVCVSPDP (XXSLDWI (XXPERMDI $A, $B, 3),
+                                        (XXPERMDI $A, $B, 3), 1)))>;
+    def : Pat<WToDPExtractConv.BV02S,
+              (v2f64 (XVCVSXWDP $A))>;
+    def : Pat<WToDPExtractConv.BV13S,
+              (v2f64 (XVCVSXWDP (XXSLDWI $A, $A, 1)))>; // words 0,2 = A[1],A[3]
+    def : Pat<WToDPExtractConv.BV02U,
+              (v2f64 (XVCVUXWDP $A))>;
+    def : Pat<WToDPExtractConv.BV13U,
+              (v2f64 (XVCVUXWDP (XXSLDWI $A, $A, 1)))>; // words 0,2 = A[1],A[3]
   }
 
   let Predicates = [IsLittleEndian, HasP8Vector] in {
@@ -4249,6 +4299,41 @@ let AddedComplexity = 400 in {
     def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
                                    ExtDbl.B0U, ExtDbl.B1U)),
               (v4i32 (VMRGEW MrgWords.CVB1A1U, MrgWords.CVB0A0U))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 1))))),
+              (v2f64 (XVCVSPDP (XXMRGLW $A, $A)))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 0))))),
+              (v2f64 (XXPERMDI (XVCVSPDP (XXMRGLW $A, $A)),
+                               (XVCVSPDP (XXMRGLW $A, $A)), 2))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 2))))),
+              (v2f64 (XVCVSPDP (XXSLDWI $A, $A, 1)))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 3))))),
+              (v2f64 (XVCVSPDP $A))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 2))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 3))))),
+              (v2f64 (XVCVSPDP (XXMRGHW $A, $A)))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))),
+                                   (f64 (fpextend (extractelt v4f32:$A, 2))))),
+              (v2f64 (XXPERMDI (XVCVSPDP (XXMRGHW $A, $A)),
+                               (XVCVSPDP (XXMRGHW $A, $A)), 2))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+                                   (f64 (fpextend (extractelt v4f32:$B, 0))))),
+              (v2f64 (XVCVSPDP (XXSLDWI (XXPERMDI $B, $A, 3),
+                                        (XXPERMDI $B, $A, 3), 1)))>;
+    def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))),
+                                   (f64 (fpextend (extractelt v4f32:$B, 3))))),
+              (v2f64 (XVCVSPDP (XXPERMDI $B, $A, 0)))>;
+    def : Pat<WToDPExtractConv.BV02S,
+              (v2f64 (XVCVSXWDP (XXSLDWI $A, $A, 1)))>;
+    def : Pat<WToDPExtractConv.BV13S,
+              (v2f64 (XVCVSXWDP $A))>;
+    def : Pat<WToDPExtractConv.BV02U,
+              (v2f64 (XVCVUXWDP (XXSLDWI $A, $A, 1)))>;
+    def : Pat<WToDPExtractConv.BV13U,
+              (v2f64 (XVCVUXWDP $A))>;
   }
 
   let Predicates = [HasDirectMove] in {

diff  --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
index 4e096b1c5c03..469cef01094b 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -6123,3 +6123,412 @@ entry:
   %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %splat.splat
 }
+
+; Some additional patterns that come up in real code.
+define dso_local <2 x double> @sint_to_fp_widen02(<4 x i32> %a) {
+; P9BE-LABEL: sint_to_fp_widen02:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xvcvsxwdp v2, v2
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: sint_to_fp_widen02:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P9LE-NEXT:    xvcvsxwdp v2, vs0
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: sint_to_fp_widen02:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xvcvsxwdp v2, v2
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: sint_to_fp_widen02:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P8LE-NEXT:    xvcvsxwdp v2, vs0
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 0
+  %conv = sitofp i32 %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x i32> %a, i32 2
+  %conv2 = sitofp i32 %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @sint_to_fp_widen13(<4 x i32> %a) {
+; P9BE-LABEL: sint_to_fp_widen13:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P9BE-NEXT:    xvcvsxwdp v2, vs0
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: sint_to_fp_widen13:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xvcvsxwdp v2, v2
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: sint_to_fp_widen13:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P8BE-NEXT:    xvcvsxwdp v2, vs0
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: sint_to_fp_widen13:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xvcvsxwdp v2, v2
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 1
+  %conv = sitofp i32 %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x i32> %a, i32 3
+  %conv2 = sitofp i32 %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @uint_to_fp_widen02(<4 x i32> %a) {
+; P9BE-LABEL: uint_to_fp_widen02:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xvcvuxwdp v2, v2
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: uint_to_fp_widen02:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P9LE-NEXT:    xvcvuxwdp v2, vs0
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: uint_to_fp_widen02:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xvcvuxwdp v2, v2
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: uint_to_fp_widen02:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P8LE-NEXT:    xvcvuxwdp v2, vs0
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 0
+  %conv = uitofp i32 %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x i32> %a, i32 2
+  %conv2 = uitofp i32 %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @uint_to_fp_widen13(<4 x i32> %a) {
+; P9BE-LABEL: uint_to_fp_widen13:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P9BE-NEXT:    xvcvuxwdp v2, vs0
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: uint_to_fp_widen13:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xvcvuxwdp v2, v2
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: uint_to_fp_widen13:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P8BE-NEXT:    xvcvuxwdp v2, vs0
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: uint_to_fp_widen13:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xvcvuxwdp v2, v2
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 1
+  %conv = uitofp i32 %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x i32> %a, i32 3
+  %conv2 = uitofp i32 %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @fp_extend01(<4 x float> %a) {
+; P9BE-LABEL: fp_extend01:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xxmrghw vs0, v2, v2
+; P9BE-NEXT:    xvcvspdp v2, vs0
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: fp_extend01:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xxmrglw vs0, v2, v2
+; P9LE-NEXT:    xvcvspdp v2, vs0
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: fp_extend01:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xxmrghw vs0, v2, v2
+; P8BE-NEXT:    xvcvspdp v2, vs0
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: fp_extend01:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xxmrglw vs0, v2, v2
+; P8LE-NEXT:    xvcvspdp v2, vs0
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x float> %a, i32 0
+  %conv = fpext float %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 1
+  %conv2 = fpext float %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @fp_extend10(<4 x float> %a) {
+; P9BE-LABEL: fp_extend10:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xxmrghw vs0, v2, v2
+; P9BE-NEXT:    xvcvspdp vs0, vs0
+; P9BE-NEXT:    xxswapd v2, vs0
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: fp_extend10:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xxmrglw vs0, v2, v2
+; P9LE-NEXT:    xvcvspdp vs0, vs0
+; P9LE-NEXT:    xxswapd v2, vs0
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: fp_extend10:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xxmrghw vs0, v2, v2
+; P8BE-NEXT:    xvcvspdp vs0, vs0
+; P8BE-NEXT:    xxswapd v2, vs0
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: fp_extend10:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xxmrglw vs0, v2, v2
+; P8LE-NEXT:    xvcvspdp vs0, vs0
+; P8LE-NEXT:    xxswapd v2, vs0
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x float> %a, i32 1
+  %conv = fpext float %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 0
+  %conv2 = fpext float %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @fp_extend02(<4 x float> %a) {
+; P9BE-LABEL: fp_extend02:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xvcvspdp v2, v2
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: fp_extend02:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P9LE-NEXT:    xvcvspdp v2, vs0
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: fp_extend02:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xvcvspdp v2, v2
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: fp_extend02:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P8LE-NEXT:    xvcvspdp v2, vs0
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x float> %a, i32 0
+  %conv = fpext float %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 2
+  %conv2 = fpext float %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @fp_extend13(<4 x float> %a) {
+; P9BE-LABEL: fp_extend13:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P9BE-NEXT:    xvcvspdp v2, vs0
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: fp_extend13:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xvcvspdp v2, v2
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: fp_extend13:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xxsldwi vs0, v2, v2, 1
+; P8BE-NEXT:    xvcvspdp v2, vs0
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: fp_extend13:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xvcvspdp v2, v2
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x float> %a, i32 1
+  %conv = fpext float %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 3
+  %conv2 = fpext float %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @fp_extend23(<4 x float> %a) {
+; P9BE-LABEL: fp_extend23:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xxmrglw vs0, v2, v2
+; P9BE-NEXT:    xvcvspdp v2, vs0
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: fp_extend23:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xxmrghw vs0, v2, v2
+; P9LE-NEXT:    xvcvspdp v2, vs0
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: fp_extend23:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xxmrglw vs0, v2, v2
+; P8BE-NEXT:    xvcvspdp v2, vs0
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: fp_extend23:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xxmrghw vs0, v2, v2
+; P8LE-NEXT:    xvcvspdp v2, vs0
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x float> %a, i32 2
+  %conv = fpext float %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 3
+  %conv2 = fpext float %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @fp_extend32(<4 x float> %a) {
+; P9BE-LABEL: fp_extend32:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xxmrglw vs0, v2, v2
+; P9BE-NEXT:    xvcvspdp vs0, vs0
+; P9BE-NEXT:    xxswapd v2, vs0
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: fp_extend32:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xxmrghw vs0, v2, v2
+; P9LE-NEXT:    xvcvspdp vs0, vs0
+; P9LE-NEXT:    xxswapd v2, vs0
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: fp_extend32:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xxmrglw vs0, v2, v2
+; P8BE-NEXT:    xvcvspdp vs0, vs0
+; P8BE-NEXT:    xxswapd v2, vs0
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: fp_extend32:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xxmrghw vs0, v2, v2
+; P8LE-NEXT:    xvcvspdp vs0, vs0
+; P8LE-NEXT:    xxswapd v2, vs0
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x float> %a, i32 3
+  %conv = fpext float %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 2
+  %conv2 = fpext float %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @fp_extend_two00(<4 x float> %a, <4 x float> %b) {
+; P9BE-LABEL: fp_extend_two00:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xxmrghd vs0, v2, v3
+; P9BE-NEXT:    xvcvspdp v2, vs0
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: fp_extend_two00:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xxmrgld vs0, v3, v2
+; P9LE-NEXT:    xxsldwi vs0, vs0, vs0, 1
+; P9LE-NEXT:    xvcvspdp v2, vs0
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: fp_extend_two00:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xxmrghd vs0, v2, v3
+; P8BE-NEXT:    xvcvspdp v2, vs0
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: fp_extend_two00:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xxmrgld vs0, v3, v2
+; P8LE-NEXT:    xxsldwi vs0, vs0, vs0, 1
+; P8LE-NEXT:    xvcvspdp v2, vs0
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x float> %a, i32 0
+  %conv = fpext float %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x float> %b, i32 0
+  %conv2 = fpext float %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}
+
+define dso_local <2 x double> @fp_extend_two33(<4 x float> %a, <4 x float> %b) {
+; P9BE-LABEL: fp_extend_two33:
+; P9BE:       # %bb.0: # %entry
+; P9BE-NEXT:    xxmrgld vs0, v2, v3
+; P9BE-NEXT:    xxsldwi vs0, vs0, vs0, 1
+; P9BE-NEXT:    xvcvspdp v2, vs0
+; P9BE-NEXT:    blr
+;
+; P9LE-LABEL: fp_extend_two33:
+; P9LE:       # %bb.0: # %entry
+; P9LE-NEXT:    xxmrghd vs0, v3, v2
+; P9LE-NEXT:    xvcvspdp v2, vs0
+; P9LE-NEXT:    blr
+;
+; P8BE-LABEL: fp_extend_two33:
+; P8BE:       # %bb.0: # %entry
+; P8BE-NEXT:    xxmrgld vs0, v2, v3
+; P8BE-NEXT:    xxsldwi vs0, vs0, vs0, 1
+; P8BE-NEXT:    xvcvspdp v2, vs0
+; P8BE-NEXT:    blr
+;
+; P8LE-LABEL: fp_extend_two33:
+; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    xxmrghd vs0, v3, v2
+; P8LE-NEXT:    xvcvspdp v2, vs0
+; P8LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x float> %a, i32 3
+  %conv = fpext float %vecext to double
+  %vecinit = insertelement <2 x double> undef, double %conv, i32 0
+  %vecext1 = extractelement <4 x float> %b, i32 3
+  %conv2 = fpext float %vecext1 to double
+  %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1
+  ret <2 x double> %vecinit3
+}

diff  --git a/llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll b/llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll
index f7727d6f4ea1..1dc40edf7146 100644
--- a/llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll
+++ b/llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll
@@ -47,33 +47,23 @@ define dso_local void @test2(<16 x float>* nocapture readonly %a, <2 x double>*
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxv vs0, 0(r3)
-; CHECK-NEXT:    xxsldwi vs1, vs0, vs0, 1
-; CHECK-NEXT:    xscvspdpn f2, vs0
-; CHECK-NEXT:    xxsldwi vs3, vs0, vs0, 3
-; CHECK-NEXT:    xxswapd vs0, vs0
-; CHECK-NEXT:    xscvspdpn f1, vs1
-; CHECK-NEXT:    xscvspdpn f3, vs3
-; CHECK-NEXT:    xscvspdpn f0, vs0
-; CHECK-NEXT:    xxmrghd vs0, vs0, vs3
-; CHECK-NEXT:    xxmrghd vs1, vs2, vs1
-; CHECK-NEXT:    stxv vs0, 0(r4)
-; CHECK-NEXT:    stxv vs1, 0(r5)
+; CHECK-NEXT:    xxmrglw vs1, vs0, vs0
+; CHECK-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-NEXT:    xvcvspdp vs1, vs1
+; CHECK-NEXT:    xvcvspdp vs0, vs0
+; CHECK-NEXT:    stxv vs1, 0(r4)
+; CHECK-NEXT:    stxv vs0, 0(r5)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test2:
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    lxv vs0, 0(r3)
-; CHECK-BE-NEXT:    xxswapd vs1, vs0
-; CHECK-BE-NEXT:    xxsldwi vs2, vs0, vs0, 3
-; CHECK-BE-NEXT:    xscvspdpn f3, vs0
-; CHECK-BE-NEXT:    xxsldwi vs0, vs0, vs0, 1
-; CHECK-BE-NEXT:    xscvspdpn f1, vs1
-; CHECK-BE-NEXT:    xscvspdpn f2, vs2
-; CHECK-BE-NEXT:    xscvspdpn f0, vs0
-; CHECK-BE-NEXT:    xxmrghd vs0, vs3, vs0
-; CHECK-BE-NEXT:    xxmrghd vs1, vs1, vs2
-; CHECK-BE-NEXT:    stxv vs0, 0(r4)
-; CHECK-BE-NEXT:    stxv vs1, 0(r5)
+; CHECK-BE-NEXT:    xxmrghw vs1, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw vs0, vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs1, vs1
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
+; CHECK-BE-NEXT:    stxv vs1, 0(r4)
+; CHECK-BE-NEXT:    stxv vs0, 0(r5)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = load <16 x float>, <16 x float>* %a, align 16

diff  --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll
index cf4a6d636207..d355dcd08b0f 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll
@@ -14,10 +14,8 @@ define <2 x i64> @test2elt(i64 %a.coerce) local_unnamed_addr #0 {
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mtvsrd f0, r3
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
-; CHECK-P8-NEXT:    xscvspdpn f0, vs0
-; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 3
-; CHECK-P8-NEXT:    xscvspdpn f1, vs1
-; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P8-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
 ; CHECK-P8-NEXT:    blr
 ;
@@ -25,20 +23,16 @@ define <2 x i64> @test2elt(i64 %a.coerce) local_unnamed_addr #0 {
 ; CHECK-P9:       # %bb.0: # %entry
 ; CHECK-P9-NEXT:    mtvsrd f0, r3
 ; CHECK-P9-NEXT:    xxswapd v2, vs0
-; CHECK-P9-NEXT:    xscvspdpn f0, vs0
-; CHECK-P9-NEXT:    xxsldwi vs1, v2, v2, 3
-; CHECK-P9-NEXT:    xscvspdpn f1, vs1
-; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P9-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P9-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-P9-NEXT:    xvcvdpuxds v2, vs0
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test2elt:
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    mtvsrd f0, r3
-; CHECK-BE-NEXT:    xscvspdpn f1, vs0
-; CHECK-BE-NEXT:    xxsldwi vs0, vs0, vs0, 1
-; CHECK-BE-NEXT:    xscvspdpn f0, vs0
-; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-BE-NEXT:    xvcvdpuxds v2, vs0
 ; CHECK-BE-NEXT:    blr
 entry:
@@ -50,16 +44,11 @@ entry:
 define void @test4elt(<4 x i64>* noalias nocapture sret %agg.result, <4 x float> %a) local_unnamed_addr #1 {
 ; CHECK-P8-LABEL: test4elt:
 ; CHECK-P8:       # %bb.0: # %entry
-; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
-; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P8-NEXT:    xxmrghw vs1, v2, v2
 ; CHECK-P8-NEXT:    li r4, 16
-; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
-; CHECK-P8-NEXT:    xscvspdpn f2, v2
-; CHECK-P8-NEXT:    xscvspdpn f0, vs0
-; CHECK-P8-NEXT:    xscvspdpn f1, vs1
-; CHECK-P8-NEXT:    xscvspdpn f3, vs3
-; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
-; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P8-NEXT:    xvcvspdp vs1, vs1
 ; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
 ; CHECK-P8-NEXT:    xvcvdpuxds v3, vs1
 ; CHECK-P8-NEXT:    xxswapd vs1, v2
@@ -70,36 +59,26 @@ define void @test4elt(<4 x i64>* noalias nocapture sret %agg.result, <4 x float>
 ;
 ; CHECK-P9-LABEL: test4elt:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    xxsldwi vs0, v2, v2, 3
-; CHECK-P9-NEXT:    xxswapd vs1, v2
-; CHECK-P9-NEXT:    xscvspdpn f0, vs0
-; CHECK-P9-NEXT:    xscvspdpn f1, vs1
-; CHECK-P9-NEXT:    xxsldwi vs2, v2, v2, 1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
-; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
-; CHECK-P9-NEXT:    xscvspdpn f1, v2
-; CHECK-P9-NEXT:    xxmrghd vs1, vs1, vs2
+; CHECK-P9-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P9-NEXT:    xxmrghw vs1, v2, v2
+; CHECK-P9-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P9-NEXT:    xvcvspdp vs1, vs1
 ; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
 ; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-P9-NEXT:    stxv vs0, 0(r3)
 ; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test4elt:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    xxsldwi vs1, v2, v2, 1
-; CHECK-BE-NEXT:    xscvspdpn f0, v2
-; CHECK-BE-NEXT:    xxswapd vs2, v2
-; CHECK-BE-NEXT:    xscvspdpn f1, vs1
-; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs1
-; CHECK-BE-NEXT:    xxsldwi vs1, v2, v2, 3
-; CHECK-BE-NEXT:    xscvspdpn f1, vs1
-; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xxmrghw vs0, v2, v2
+; CHECK-BE-NEXT:    xxmrglw vs1, v2, v2
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs1, vs1
 ; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-BE-NEXT:    xxmrghd vs1, vs2, vs1
 ; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = fptoui <4 x float> %a to <4 x i64>
@@ -115,31 +94,21 @@ define void @test8elt(<8 x i64>* noalias nocapture sret %agg.result, <8 x float>
 ; CHECK-P8-NEXT:    li r6, 32
 ; CHECK-P8-NEXT:    lvx v2, r4, r5
 ; CHECK-P8-NEXT:    li r4, 48
-; CHECK-P8-NEXT:    xxsldwi vs5, v3, v3, 3
-; CHECK-P8-NEXT:    xxswapd vs6, v3
-; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
-; CHECK-P8-NEXT:    xxswapd vs1, v2
-; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
-; CHECK-P8-NEXT:    xxsldwi vs7, v3, v3, 1
-; CHECK-P8-NEXT:    xscvspdpn f2, v2
-; CHECK-P8-NEXT:    xscvspdpn f4, v3
-; CHECK-P8-NEXT:    xscvspdpn f0, vs0
-; CHECK-P8-NEXT:    xscvspdpn f1, vs1
-; CHECK-P8-NEXT:    xscvspdpn f3, vs3
-; CHECK-P8-NEXT:    xscvspdpn f5, vs5
-; CHECK-P8-NEXT:    xscvspdpn f6, vs6
-; CHECK-P8-NEXT:    xscvspdpn f7, vs7
-; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
-; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
-; CHECK-P8-NEXT:    xxmrghd vs2, vs6, vs5
+; CHECK-P8-NEXT:    xxmrglw vs2, v3, v3
+; CHECK-P8-NEXT:    xxmrghw vs3, v3, v3
+; CHECK-P8-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P8-NEXT:    xxmrghw vs1, v2, v2
+; CHECK-P8-NEXT:    xvcvspdp vs2, vs2
+; CHECK-P8-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P8-NEXT:    xvcvspdp vs1, vs1
+; CHECK-P8-NEXT:    xvcvspdp vs3, vs3
+; CHECK-P8-NEXT:    xvcvdpuxds v4, vs2
 ; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
-; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs7
 ; CHECK-P8-NEXT:    xvcvdpuxds v3, vs1
-; CHECK-P8-NEXT:    xvcvdpuxds v4, vs2
 ; CHECK-P8-NEXT:    xvcvdpuxds v5, vs3
+; CHECK-P8-NEXT:    xxswapd vs3, v4
 ; CHECK-P8-NEXT:    xxswapd vs1, v2
 ; CHECK-P8-NEXT:    xxswapd vs0, v3
-; CHECK-P8-NEXT:    xxswapd vs3, v4
 ; CHECK-P8-NEXT:    xxswapd vs2, v5
 ; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
 ; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
@@ -149,65 +118,45 @@ define void @test8elt(<8 x i64>* noalias nocapture sret %agg.result, <8 x float>
 ;
 ; CHECK-P9-LABEL: test8elt:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    lxv vs0, 0(r4)
-; CHECK-P9-NEXT:    xxsldwi vs1, vs0, vs0, 3
-; CHECK-P9-NEXT:    xxswapd vs2, vs0
-; CHECK-P9-NEXT:    xscvspdpn f1, vs1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
-; CHECK-P9-NEXT:    xscvspdpn f3, vs0
-; CHECK-P9-NEXT:    xxsldwi vs0, vs0, vs0, 1
-; CHECK-P9-NEXT:    xscvspdpn f0, vs0
-; CHECK-P9-NEXT:    xxmrghd vs1, vs2, vs1
-; CHECK-P9-NEXT:    lxv vs2, 16(r4)
-; CHECK-P9-NEXT:    xxmrghd vs0, vs3, vs0
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xxmrglw vs2, vs1, vs1
+; CHECK-P9-NEXT:    xxmrghw vs1, vs1, vs1
+; CHECK-P9-NEXT:    xxmrglw vs3, vs0, vs0
+; CHECK-P9-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-P9-NEXT:    xvcvspdp vs2, vs2
+; CHECK-P9-NEXT:    xvcvspdp vs1, vs1
+; CHECK-P9-NEXT:    xvcvspdp vs3, vs3
+; CHECK-P9-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
 ; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-P9-NEXT:    xxsldwi vs3, vs2, vs2, 3
-; CHECK-P9-NEXT:    xxswapd vs4, vs2
-; CHECK-P9-NEXT:    xscvspdpn f3, vs3
-; CHECK-P9-NEXT:    xscvspdpn f4, vs4
-; CHECK-P9-NEXT:    stxv vs0, 16(r3)
-; CHECK-P9-NEXT:    xxmrghd vs3, vs4, vs3
-; CHECK-P9-NEXT:    xscvspdpn f4, vs2
-; CHECK-P9-NEXT:    xxsldwi vs2, vs2, vs2, 1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
 ; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
-; CHECK-P9-NEXT:    xxmrghd vs2, vs4, vs2
-; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
 ; CHECK-P9-NEXT:    stxv vs3, 32(r3)
-; CHECK-P9-NEXT:    stxv vs2, 48(r3)
-; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs2, 0(r3)
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test8elt:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxv vs1, 0(r4)
-; CHECK-BE-NEXT:    xxsldwi vs3, vs1, vs1, 1
-; CHECK-BE-NEXT:    xscvspdpn f2, vs1
-; CHECK-BE-NEXT:    xscvspdpn f3, vs3
 ; CHECK-BE-NEXT:    lxv vs0, 16(r4)
-; CHECK-BE-NEXT:    xxsldwi vs4, vs0, vs0, 1
-; CHECK-BE-NEXT:    xscvspdpn f4, vs4
-; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs3
-; CHECK-BE-NEXT:    xxsldwi vs3, vs1, vs1, 3
-; CHECK-BE-NEXT:    xxswapd vs1, vs1
-; CHECK-BE-NEXT:    xscvspdpn f3, vs3
-; CHECK-BE-NEXT:    xscvspdpn f1, vs1
-; CHECK-BE-NEXT:    xxmrghd vs1, vs1, vs3
-; CHECK-BE-NEXT:    xscvspdpn f3, vs0
-; CHECK-BE-NEXT:    xxmrghd vs3, vs3, vs4
-; CHECK-BE-NEXT:    xxsldwi vs4, vs0, vs0, 3
-; CHECK-BE-NEXT:    xxswapd vs0, vs0
-; CHECK-BE-NEXT:    xscvspdpn f0, vs0
-; CHECK-BE-NEXT:    xscvspdpn f4, vs4
-; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs4
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xxmrghw vs2, vs1, vs1
+; CHECK-BE-NEXT:    xxmrglw vs1, vs1, vs1
+; CHECK-BE-NEXT:    xxmrghw vs3, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw vs0, vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs2, vs2
+; CHECK-BE-NEXT:    xvcvspdp vs1, vs1
+; CHECK-BE-NEXT:    xvcvspdp vs3, vs3
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
 ; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
 ; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
-; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-BE-NEXT:    stxv vs3, 32(r3)
 ; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs3, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    stxv vs2, 0(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
@@ -220,70 +169,50 @@ entry:
 define void @test16elt(<16 x i64>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #2 {
 ; CHECK-P8-LABEL: test16elt:
 ; CHECK-P8:       # %bb.0: # %entry
-; CHECK-P8-NEXT:    li r5, 16
 ; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    li r5, 16
 ; CHECK-P8-NEXT:    li r6, 32
-; CHECK-P8-NEXT:    lvx v4, 0, r4
 ; CHECK-P8-NEXT:    li r8, 64
-; CHECK-P8-NEXT:    lvx v5, r4, r5
-; CHECK-P8-NEXT:    lvx v3, r4, r7
-; CHECK-P8-NEXT:    lvx v2, r4, r6
+; CHECK-P8-NEXT:    lvx v4, r4, r7
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r6
+; CHECK-P8-NEXT:    xxmrghw vs3, v4, v4
+; CHECK-P8-NEXT:    xxmrglw vs5, v4, v4
+; CHECK-P8-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P8-NEXT:    xxmrghw vs1, v2, v2
+; CHECK-P8-NEXT:    lvx v2, 0, r4
 ; CHECK-P8-NEXT:    li r4, 112
-; CHECK-P8-NEXT:    xxsldwi vs13, v4, v4, 3
-; CHECK-P8-NEXT:    xscvspdpn f6, v4
-; CHECK-P8-NEXT:    xxsldwi vs1, v5, v5, 3
-; CHECK-P8-NEXT:    xxswapd vs3, v5
-; CHECK-P8-NEXT:    xxsldwi vs9, v3, v3, 1
-; CHECK-P8-NEXT:    xscvspdpn f4, v3
-; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
-; CHECK-P8-NEXT:    xxsldwi vs10, v3, v3, 3
-; CHECK-P8-NEXT:    xscvspdpn f1, vs1
-; CHECK-P8-NEXT:    xxswapd vs11, v3
-; CHECK-P8-NEXT:    xscvspdpn f3, vs3
-; CHECK-P8-NEXT:    xxsldwi vs7, v2, v2, 3
-; CHECK-P8-NEXT:    xscvspdpn f9, vs9
-; CHECK-P8-NEXT:    xxswapd vs8, v2
-; CHECK-P8-NEXT:    xscvspdpn f0, v5
-; CHECK-P8-NEXT:    xxsldwi vs12, v2, v2, 1
-; CHECK-P8-NEXT:    xscvspdpn f2, v2
-; CHECK-P8-NEXT:    xxswapd v2, v4
-; CHECK-P8-NEXT:    xscvspdpn f5, vs5
-; CHECK-P8-NEXT:    xxsldwi v3, v4, v4, 1
-; CHECK-P8-NEXT:    xscvspdpn f10, vs10
-; CHECK-P8-NEXT:    xscvspdpn f11, vs11
-; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs1
-; CHECK-P8-NEXT:    xscvspdpn f7, vs7
-; CHECK-P8-NEXT:    xxmrghd vs4, vs4, vs9
-; CHECK-P8-NEXT:    xscvspdpn f8, vs8
-; CHECK-P8-NEXT:    xscvspdpn f12, vs12
-; CHECK-P8-NEXT:    xscvspdpn f13, vs13
-; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs5
-; CHECK-P8-NEXT:    xscvspdpn f3, v2
-; CHECK-P8-NEXT:    xscvspdpn f9, v3
-; CHECK-P8-NEXT:    xxmrghd vs5, vs11, vs10
-; CHECK-P8-NEXT:    xvcvdpuxds v3, vs4
-; CHECK-P8-NEXT:    xvcvdpuxds v2, vs1
-; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs12
-; CHECK-P8-NEXT:    xxmrghd vs2, vs8, vs7
-; CHECK-P8-NEXT:    xvcvdpuxds v4, vs0
-; CHECK-P8-NEXT:    xxmrghd vs0, vs3, vs13
+; CHECK-P8-NEXT:    xxmrglw vs2, v3, v3
+; CHECK-P8-NEXT:    xxmrghw vs4, v3, v3
+; CHECK-P8-NEXT:    xvcvspdp vs3, vs3
+; CHECK-P8-NEXT:    xxmrglw vs6, v2, v2
+; CHECK-P8-NEXT:    xxmrghw vs7, v2, v2
+; CHECK-P8-NEXT:    xvcvspdp vs5, vs5
+; CHECK-P8-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P8-NEXT:    xvcvspdp vs1, vs1
+; CHECK-P8-NEXT:    xvcvspdp vs2, vs2
+; CHECK-P8-NEXT:    xvcvspdp vs4, vs4
+; CHECK-P8-NEXT:    xvcvspdp vs6, vs6
+; CHECK-P8-NEXT:    xvcvspdp vs7, vs7
+; CHECK-P8-NEXT:    xvcvdpuxds v3, vs3
 ; CHECK-P8-NEXT:    xvcvdpuxds v5, vs5
-; CHECK-P8-NEXT:    xxmrghd vs3, vs6, vs9
-; CHECK-P8-NEXT:    xvcvdpuxds v0, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P8-NEXT:    xvcvdpuxds v4, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v0, vs4
 ; CHECK-P8-NEXT:    xvcvdpuxds v1, vs2
-; CHECK-P8-NEXT:    xvcvdpuxds v6, vs0
+; CHECK-P8-NEXT:    xvcvdpuxds v6, vs6
 ; CHECK-P8-NEXT:    xxswapd vs0, v3
-; CHECK-P8-NEXT:    xvcvdpuxds v7, vs3
-; CHECK-P8-NEXT:    xxswapd vs4, v2
-; CHECK-P8-NEXT:    xxswapd vs3, v4
+; CHECK-P8-NEXT:    xvcvdpuxds v7, vs7
 ; CHECK-P8-NEXT:    xxswapd vs1, v5
+; CHECK-P8-NEXT:    xxswapd vs4, v2
 ; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
 ; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xxswapd vs3, v4
 ; CHECK-P8-NEXT:    xxswapd vs2, v0
-; CHECK-P8-NEXT:    xxswapd vs0, v1
 ; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
-; CHECK-P8-NEXT:    xxswapd vs5, v6
 ; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xxswapd vs0, v1
+; CHECK-P8-NEXT:    xxswapd vs5, v6
 ; CHECK-P8-NEXT:    xxswapd vs1, v7
 ; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
 ; CHECK-P8-NEXT:    stxvd2x vs0, r3, r8
@@ -295,122 +224,82 @@ define void @test16elt(<16 x i64>* noalias nocapture sret %agg.result, <16 x flo
 ;
 ; CHECK-P9-LABEL: test16elt:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    lxv vs4, 16(r4)
-; CHECK-P9-NEXT:    xxsldwi vs5, vs4, vs4, 3
-; CHECK-P9-NEXT:    xxswapd vs6, vs4
-; CHECK-P9-NEXT:    lxv vs0, 0(r4)
-; CHECK-P9-NEXT:    xxsldwi vs1, vs0, vs0, 3
-; CHECK-P9-NEXT:    xxswapd vs2, vs0
-; CHECK-P9-NEXT:    xscvspdpn f5, vs5
-; CHECK-P9-NEXT:    xscvspdpn f6, vs6
-; CHECK-P9-NEXT:    xxmrghd vs5, vs6, vs5
-; CHECK-P9-NEXT:    xscvspdpn f6, vs4
-; CHECK-P9-NEXT:    xxsldwi vs4, vs4, vs4, 1
-; CHECK-P9-NEXT:    lxv vs3, 32(r4)
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
-; CHECK-P9-NEXT:    xxswapd vs7, vs3
-; CHECK-P9-NEXT:    xscvspdpn f7, vs7
-; CHECK-P9-NEXT:    xscvspdpn f4, vs4
-; CHECK-P9-NEXT:    xscvspdpn f1, vs1
-; CHECK-P9-NEXT:    xxmrghd vs1, vs2, vs1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs0
-; CHECK-P9-NEXT:    xxsldwi vs0, vs0, vs0, 1
-; CHECK-P9-NEXT:    xscvspdpn f0, vs0
-; CHECK-P9-NEXT:    xxmrghd vs0, vs2, vs0
-; CHECK-P9-NEXT:    xxmrghd vs4, vs6, vs4
-; CHECK-P9-NEXT:    xxsldwi vs6, vs3, vs3, 3
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    lxv vs3, 16(r4)
+; CHECK-P9-NEXT:    lxv vs5, 32(r4)
+; CHECK-P9-NEXT:    xxmrglw vs2, vs1, vs1
+; CHECK-P9-NEXT:    xxmrghw vs1, vs1, vs1
+; CHECK-P9-NEXT:    xxmrglw vs4, vs3, vs3
+; CHECK-P9-NEXT:    xxmrghw vs3, vs3, vs3
+; CHECK-P9-NEXT:    xxmrglw vs6, vs5, vs5
+; CHECK-P9-NEXT:    xxmrghw vs5, vs5, vs5
+; CHECK-P9-NEXT:    xxmrglw vs7, vs0, vs0
+; CHECK-P9-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-P9-NEXT:    xvcvspdp vs2, vs2
+; CHECK-P9-NEXT:    xvcvspdp vs1, vs1
+; CHECK-P9-NEXT:    xvcvspdp vs4, vs4
+; CHECK-P9-NEXT:    xvcvspdp vs3, vs3
+; CHECK-P9-NEXT:    xvcvspdp vs6, vs6
+; CHECK-P9-NEXT:    xvcvspdp vs5, vs5
+; CHECK-P9-NEXT:    xvcvspdp vs7, vs7
+; CHECK-P9-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
 ; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-P9-NEXT:    xvcvdpuxds vs5, vs5
-; CHECK-P9-NEXT:    xscvspdpn f6, vs6
-; CHECK-P9-NEXT:    xxmrghd vs6, vs7, vs6
-; CHECK-P9-NEXT:    xscvspdpn f7, vs3
-; CHECK-P9-NEXT:    xxsldwi vs3, vs3, vs3, 1
-; CHECK-P9-NEXT:    lxv vs2, 48(r4)
-; CHECK-P9-NEXT:    xxswapd vs8, vs2
-; CHECK-P9-NEXT:    xscvspdpn f8, vs8
 ; CHECK-P9-NEXT:    xvcvdpuxds vs4, vs4
-; CHECK-P9-NEXT:    xscvspdpn f3, vs3
-; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs3
-; CHECK-P9-NEXT:    xxsldwi vs7, vs2, vs2, 3
-; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-P9-NEXT:    xvcvdpuxds vs6, vs6
-; CHECK-P9-NEXT:    stxv vs6, 64(r3)
-; CHECK-P9-NEXT:    xscvspdpn f7, vs7
-; CHECK-P9-NEXT:    xxmrghd vs7, vs8, vs7
-; CHECK-P9-NEXT:    xscvspdpn f8, vs2
-; CHECK-P9-NEXT:    xxsldwi vs2, vs2, vs2, 1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
-; CHECK-P9-NEXT:    xxmrghd vs2, vs8, vs2
 ; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-P9-NEXT:    xvcvdpuxds vs5, vs5
 ; CHECK-P9-NEXT:    xvcvdpuxds vs7, vs7
-; CHECK-P9-NEXT:    stxv vs3, 80(r3)
-; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 112(r3)
 ; CHECK-P9-NEXT:    stxv vs7, 96(r3)
-; CHECK-P9-NEXT:    stxv vs2, 112(r3)
-; CHECK-P9-NEXT:    stxv vs4, 48(r3)
-; CHECK-P9-NEXT:    stxv vs5, 32(r3)
-; CHECK-P9-NEXT:    stxv vs0, 16(r3)
-; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    stxv vs6, 64(r3)
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs4, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs2, 0(r3)
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test16elt:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxv vs0, 0(r4)
-; CHECK-BE-NEXT:    lxv vs4, 16(r4)
-; CHECK-BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
-; CHECK-BE-NEXT:    xscvspdpn f1, vs0
-; CHECK-BE-NEXT:    xxsldwi vs5, vs0, vs0, 3
-; CHECK-BE-NEXT:    xxswapd vs0, vs0
-; CHECK-BE-NEXT:    xscvspdpn f5, vs5
-; CHECK-BE-NEXT:    xscvspdpn f0, vs0
-; CHECK-BE-NEXT:    xxsldwi vs6, vs4, vs4, 1
-; CHECK-BE-NEXT:    xscvspdpn f6, vs6
-; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs5
-; CHECK-BE-NEXT:    xscvspdpn f5, vs4
-; CHECK-BE-NEXT:    lxv vs3, 32(r4)
-; CHECK-BE-NEXT:    xxsldwi vs7, vs3, vs3, 1
-; CHECK-BE-NEXT:    xscvspdpn f7, vs7
-; CHECK-BE-NEXT:    xxmrghd vs5, vs5, vs6
-; CHECK-BE-NEXT:    xxsldwi vs6, vs4, vs4, 3
-; CHECK-BE-NEXT:    xxswapd vs4, vs4
-; CHECK-BE-NEXT:    xscvspdpn f6, vs6
-; CHECK-BE-NEXT:    xscvspdpn f4, vs4
-; CHECK-BE-NEXT:    xscvspdpn f2, vs2
-; CHECK-BE-NEXT:    xxmrghd vs1, vs1, vs2
-; CHECK-BE-NEXT:    lxv vs2, 48(r4)
-; CHECK-BE-NEXT:    xxsldwi vs8, vs2, vs2, 1
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    lxv vs3, 16(r4)
+; CHECK-BE-NEXT:    lxv vs5, 32(r4)
+; CHECK-BE-NEXT:    xxmrghw vs2, vs1, vs1
+; CHECK-BE-NEXT:    xxmrglw vs1, vs1, vs1
+; CHECK-BE-NEXT:    xxmrghw vs4, vs3, vs3
+; CHECK-BE-NEXT:    xxmrglw vs3, vs3, vs3
+; CHECK-BE-NEXT:    xxmrghw vs6, vs5, vs5
+; CHECK-BE-NEXT:    xxmrglw vs5, vs5, vs5
+; CHECK-BE-NEXT:    xxmrghw vs7, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw vs0, vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs2, vs2
+; CHECK-BE-NEXT:    xvcvspdp vs1, vs1
+; CHECK-BE-NEXT:    xvcvspdp vs4, vs4
+; CHECK-BE-NEXT:    xvcvspdp vs3, vs3
+; CHECK-BE-NEXT:    xvcvspdp vs6, vs6
+; CHECK-BE-NEXT:    xvcvspdp vs5, vs5
+; CHECK-BE-NEXT:    xvcvspdp vs7, vs7
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
 ; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-BE-NEXT:    xvcvdpuxds vs5, vs5
-; CHECK-BE-NEXT:    xscvspdpn f8, vs8
-; CHECK-BE-NEXT:    xxmrghd vs4, vs4, vs6
-; CHECK-BE-NEXT:    xscvspdpn f6, vs3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
-; CHECK-BE-NEXT:    xxmrghd vs6, vs6, vs7
-; CHECK-BE-NEXT:    xxsldwi vs7, vs3, vs3, 3
-; CHECK-BE-NEXT:    xxswapd vs3, vs3
-; CHECK-BE-NEXT:    xscvspdpn f7, vs7
-; CHECK-BE-NEXT:    xscvspdpn f3, vs3
-; CHECK-BE-NEXT:    xxmrghd vs3, vs3, vs7
-; CHECK-BE-NEXT:    xscvspdpn f7, vs2
-; CHECK-BE-NEXT:    xxmrghd vs7, vs7, vs8
-; CHECK-BE-NEXT:    xxsldwi vs8, vs2, vs2, 3
-; CHECK-BE-NEXT:    xxswapd vs2, vs2
-; CHECK-BE-NEXT:    xscvspdpn f8, vs8
-; CHECK-BE-NEXT:    xscvspdpn f2, vs2
-; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs8
-; CHECK-BE-NEXT:    stxv vs5, 32(r3)
 ; CHECK-BE-NEXT:    xvcvdpuxds vs4, vs4
-; CHECK-BE-NEXT:    xvcvdpuxds vs6, vs6
 ; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-BE-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-BE-NEXT:    xvcvdpuxds vs5, vs5
 ; CHECK-BE-NEXT:    xvcvdpuxds vs7, vs7
-; CHECK-BE-NEXT:    stxv vs3, 80(r3)
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 112(r3)
 ; CHECK-BE-NEXT:    stxv vs7, 96(r3)
-; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
-; CHECK-BE-NEXT:    stxv vs2, 112(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
 ; CHECK-BE-NEXT:    stxv vs6, 64(r3)
-; CHECK-BE-NEXT:    stxv vs4, 48(r3)
-; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs4, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs2, 0(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %a = load <16 x float>, <16 x float>* %0, align 64
@@ -424,10 +313,8 @@ define <2 x i64> @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 {
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mtvsrd f0, r3
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
-; CHECK-P8-NEXT:    xscvspdpn f0, vs0
-; CHECK-P8-NEXT:    xxsldwi vs1, v2, v2, 3
-; CHECK-P8-NEXT:    xscvspdpn f1, vs1
-; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P8-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P8-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
 ; CHECK-P8-NEXT:    blr
 ;
@@ -435,20 +322,16 @@ define <2 x i64> @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 {
 ; CHECK-P9:       # %bb.0: # %entry
 ; CHECK-P9-NEXT:    mtvsrd f0, r3
 ; CHECK-P9-NEXT:    xxswapd v2, vs0
-; CHECK-P9-NEXT:    xscvspdpn f0, vs0
-; CHECK-P9-NEXT:    xxsldwi vs1, v2, v2, 3
-; CHECK-P9-NEXT:    xscvspdpn f1, vs1
-; CHECK-P9-NEXT:    xxmrghd vs0, vs0, vs1
+; CHECK-P9-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P9-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-P9-NEXT:    xvcvdpuxds v2, vs0
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test2elt_signed:
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    mtvsrd f0, r3
-; CHECK-BE-NEXT:    xscvspdpn f1, vs0
-; CHECK-BE-NEXT:    xxsldwi vs0, vs0, vs0, 1
-; CHECK-BE-NEXT:    xscvspdpn f0, vs0
-; CHECK-BE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-BE-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-BE-NEXT:    xvcvdpuxds v2, vs0
 ; CHECK-BE-NEXT:    blr
 entry:
@@ -460,16 +343,11 @@ entry:
 define void @test4elt_signed(<4 x i64>* noalias nocapture sret %agg.result, <4 x float> %a) local_unnamed_addr #1 {
 ; CHECK-P8-LABEL: test4elt_signed:
 ; CHECK-P8:       # %bb.0: # %entry
-; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
-; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P8-NEXT:    xxmrghw vs1, v2, v2
 ; CHECK-P8-NEXT:    li r4, 16
-; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
-; CHECK-P8-NEXT:    xscvspdpn f2, v2
-; CHECK-P8-NEXT:    xscvspdpn f0, vs0
-; CHECK-P8-NEXT:    xscvspdpn f1, vs1
-; CHECK-P8-NEXT:    xscvspdpn f3, vs3
-; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
-; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
+; CHECK-P8-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P8-NEXT:    xvcvspdp vs1, vs1
 ; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
 ; CHECK-P8-NEXT:    xvcvdpuxds v3, vs1
 ; CHECK-P8-NEXT:    xxswapd vs1, v2
@@ -480,36 +358,26 @@ define void @test4elt_signed(<4 x i64>* noalias nocapture sret %agg.result, <4 x
 ;
 ; CHECK-P9-LABEL: test4elt_signed:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    xxsldwi vs0, v2, v2, 3
-; CHECK-P9-NEXT:    xxswapd vs1, v2
-; CHECK-P9-NEXT:    xscvspdpn f0, vs0
-; CHECK-P9-NEXT:    xscvspdpn f1, vs1
-; CHECK-P9-NEXT:    xxsldwi vs2, v2, v2, 1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
-; CHECK-P9-NEXT:    xxmrghd vs0, vs1, vs0
-; CHECK-P9-NEXT:    xscvspdpn f1, v2
-; CHECK-P9-NEXT:    xxmrghd vs1, vs1, vs2
+; CHECK-P9-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P9-NEXT:    xxmrghw vs1, v2, v2
+; CHECK-P9-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P9-NEXT:    xvcvspdp vs1, vs1
 ; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
 ; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-P9-NEXT:    stxv vs0, 0(r3)
 ; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs0, 0(r3)
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test4elt_signed:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    xxsldwi vs1, v2, v2, 1
-; CHECK-BE-NEXT:    xscvspdpn f0, v2
-; CHECK-BE-NEXT:    xxswapd vs2, v2
-; CHECK-BE-NEXT:    xscvspdpn f1, vs1
-; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs1
-; CHECK-BE-NEXT:    xxsldwi vs1, v2, v2, 3
-; CHECK-BE-NEXT:    xscvspdpn f1, vs1
-; CHECK-BE-NEXT:    xscvspdpn f2, vs2
+; CHECK-BE-NEXT:    xxmrghw vs0, v2, v2
+; CHECK-BE-NEXT:    xxmrglw vs1, v2, v2
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs1, vs1
 ; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-BE-NEXT:    xxmrghd vs1, vs2, vs1
 ; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = fptoui <4 x float> %a to <4 x i64>
@@ -525,31 +393,21 @@ define void @test8elt_signed(<8 x i64>* noalias nocapture sret %agg.result, <8 x
 ; CHECK-P8-NEXT:    li r6, 32
 ; CHECK-P8-NEXT:    lvx v2, r4, r5
 ; CHECK-P8-NEXT:    li r4, 48
-; CHECK-P8-NEXT:    xxsldwi vs5, v3, v3, 3
-; CHECK-P8-NEXT:    xxswapd vs6, v3
-; CHECK-P8-NEXT:    xxsldwi vs0, v2, v2, 3
-; CHECK-P8-NEXT:    xxswapd vs1, v2
-; CHECK-P8-NEXT:    xxsldwi vs3, v2, v2, 1
-; CHECK-P8-NEXT:    xxsldwi vs7, v3, v3, 1
-; CHECK-P8-NEXT:    xscvspdpn f2, v2
-; CHECK-P8-NEXT:    xscvspdpn f4, v3
-; CHECK-P8-NEXT:    xscvspdpn f0, vs0
-; CHECK-P8-NEXT:    xscvspdpn f1, vs1
-; CHECK-P8-NEXT:    xscvspdpn f3, vs3
-; CHECK-P8-NEXT:    xscvspdpn f5, vs5
-; CHECK-P8-NEXT:    xscvspdpn f6, vs6
-; CHECK-P8-NEXT:    xscvspdpn f7, vs7
-; CHECK-P8-NEXT:    xxmrghd vs0, vs1, vs0
-; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs3
-; CHECK-P8-NEXT:    xxmrghd vs2, vs6, vs5
+; CHECK-P8-NEXT:    xxmrglw vs2, v3, v3
+; CHECK-P8-NEXT:    xxmrghw vs3, v3, v3
+; CHECK-P8-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P8-NEXT:    xxmrghw vs1, v2, v2
+; CHECK-P8-NEXT:    xvcvspdp vs2, vs2
+; CHECK-P8-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P8-NEXT:    xvcvspdp vs1, vs1
+; CHECK-P8-NEXT:    xvcvspdp vs3, vs3
+; CHECK-P8-NEXT:    xvcvdpuxds v4, vs2
 ; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
-; CHECK-P8-NEXT:    xxmrghd vs3, vs4, vs7
 ; CHECK-P8-NEXT:    xvcvdpuxds v3, vs1
-; CHECK-P8-NEXT:    xvcvdpuxds v4, vs2
 ; CHECK-P8-NEXT:    xvcvdpuxds v5, vs3
+; CHECK-P8-NEXT:    xxswapd vs3, v4
 ; CHECK-P8-NEXT:    xxswapd vs1, v2
 ; CHECK-P8-NEXT:    xxswapd vs0, v3
-; CHECK-P8-NEXT:    xxswapd vs3, v4
 ; CHECK-P8-NEXT:    xxswapd vs2, v5
 ; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
 ; CHECK-P8-NEXT:    stxvd2x vs1, r3, r6
@@ -559,65 +417,45 @@ define void @test8elt_signed(<8 x i64>* noalias nocapture sret %agg.result, <8 x
 ;
 ; CHECK-P9-LABEL: test8elt_signed:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    lxv vs0, 0(r4)
-; CHECK-P9-NEXT:    xxsldwi vs1, vs0, vs0, 3
-; CHECK-P9-NEXT:    xxswapd vs2, vs0
-; CHECK-P9-NEXT:    xscvspdpn f1, vs1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
-; CHECK-P9-NEXT:    xscvspdpn f3, vs0
-; CHECK-P9-NEXT:    xxsldwi vs0, vs0, vs0, 1
-; CHECK-P9-NEXT:    xscvspdpn f0, vs0
-; CHECK-P9-NEXT:    xxmrghd vs1, vs2, vs1
-; CHECK-P9-NEXT:    lxv vs2, 16(r4)
-; CHECK-P9-NEXT:    xxmrghd vs0, vs3, vs0
+; CHECK-P9-NEXT:    lxv vs0, 16(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    xxmrglw vs2, vs1, vs1
+; CHECK-P9-NEXT:    xxmrghw vs1, vs1, vs1
+; CHECK-P9-NEXT:    xxmrglw vs3, vs0, vs0
+; CHECK-P9-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-P9-NEXT:    xvcvspdp vs2, vs2
+; CHECK-P9-NEXT:    xvcvspdp vs1, vs1
+; CHECK-P9-NEXT:    xvcvspdp vs3, vs3
+; CHECK-P9-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
 ; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-P9-NEXT:    xxsldwi vs3, vs2, vs2, 3
-; CHECK-P9-NEXT:    xxswapd vs4, vs2
-; CHECK-P9-NEXT:    xscvspdpn f3, vs3
-; CHECK-P9-NEXT:    xscvspdpn f4, vs4
-; CHECK-P9-NEXT:    stxv vs0, 16(r3)
-; CHECK-P9-NEXT:    xxmrghd vs3, vs4, vs3
-; CHECK-P9-NEXT:    xscvspdpn f4, vs2
-; CHECK-P9-NEXT:    xxsldwi vs2, vs2, vs2, 1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
 ; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
-; CHECK-P9-NEXT:    xxmrghd vs2, vs4, vs2
-; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 48(r3)
 ; CHECK-P9-NEXT:    stxv vs3, 32(r3)
-; CHECK-P9-NEXT:    stxv vs2, 48(r3)
-; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs2, 0(r3)
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test8elt_signed:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxv vs1, 0(r4)
-; CHECK-BE-NEXT:    xxsldwi vs3, vs1, vs1, 1
-; CHECK-BE-NEXT:    xscvspdpn f2, vs1
-; CHECK-BE-NEXT:    xscvspdpn f3, vs3
 ; CHECK-BE-NEXT:    lxv vs0, 16(r4)
-; CHECK-BE-NEXT:    xxsldwi vs4, vs0, vs0, 1
-; CHECK-BE-NEXT:    xscvspdpn f4, vs4
-; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs3
-; CHECK-BE-NEXT:    xxsldwi vs3, vs1, vs1, 3
-; CHECK-BE-NEXT:    xxswapd vs1, vs1
-; CHECK-BE-NEXT:    xscvspdpn f3, vs3
-; CHECK-BE-NEXT:    xscvspdpn f1, vs1
-; CHECK-BE-NEXT:    xxmrghd vs1, vs1, vs3
-; CHECK-BE-NEXT:    xscvspdpn f3, vs0
-; CHECK-BE-NEXT:    xxmrghd vs3, vs3, vs4
-; CHECK-BE-NEXT:    xxsldwi vs4, vs0, vs0, 3
-; CHECK-BE-NEXT:    xxswapd vs0, vs0
-; CHECK-BE-NEXT:    xscvspdpn f0, vs0
-; CHECK-BE-NEXT:    xscvspdpn f4, vs4
-; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs4
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    xxmrghw vs2, vs1, vs1
+; CHECK-BE-NEXT:    xxmrglw vs1, vs1, vs1
+; CHECK-BE-NEXT:    xxmrghw vs3, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw vs0, vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs2, vs2
+; CHECK-BE-NEXT:    xvcvspdp vs1, vs1
+; CHECK-BE-NEXT:    xvcvspdp vs3, vs3
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
 ; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
 ; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
-; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-BE-NEXT:    stxv vs3, 32(r3)
 ; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs3, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    stxv vs2, 0(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
@@ -630,70 +468,50 @@ entry:
 define void @test16elt_signed(<16 x i64>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #2 {
 ; CHECK-P8-LABEL: test16elt_signed:
 ; CHECK-P8:       # %bb.0: # %entry
-; CHECK-P8-NEXT:    li r5, 16
 ; CHECK-P8-NEXT:    li r7, 48
+; CHECK-P8-NEXT:    li r5, 16
 ; CHECK-P8-NEXT:    li r6, 32
-; CHECK-P8-NEXT:    lvx v4, 0, r4
 ; CHECK-P8-NEXT:    li r8, 64
-; CHECK-P8-NEXT:    lvx v5, r4, r5
-; CHECK-P8-NEXT:    lvx v3, r4, r7
-; CHECK-P8-NEXT:    lvx v2, r4, r6
+; CHECK-P8-NEXT:    lvx v4, r4, r7
+; CHECK-P8-NEXT:    lvx v2, r4, r5
+; CHECK-P8-NEXT:    lvx v3, r4, r6
+; CHECK-P8-NEXT:    xxmrghw vs3, v4, v4
+; CHECK-P8-NEXT:    xxmrglw vs5, v4, v4
+; CHECK-P8-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-P8-NEXT:    xxmrghw vs1, v2, v2
+; CHECK-P8-NEXT:    lvx v2, 0, r4
 ; CHECK-P8-NEXT:    li r4, 112
-; CHECK-P8-NEXT:    xxsldwi vs13, v4, v4, 3
-; CHECK-P8-NEXT:    xscvspdpn f6, v4
-; CHECK-P8-NEXT:    xxsldwi vs1, v5, v5, 3
-; CHECK-P8-NEXT:    xxswapd vs3, v5
-; CHECK-P8-NEXT:    xxsldwi vs9, v3, v3, 1
-; CHECK-P8-NEXT:    xscvspdpn f4, v3
-; CHECK-P8-NEXT:    xxsldwi vs5, v5, v5, 1
-; CHECK-P8-NEXT:    xxsldwi vs10, v3, v3, 3
-; CHECK-P8-NEXT:    xscvspdpn f1, vs1
-; CHECK-P8-NEXT:    xxswapd vs11, v3
-; CHECK-P8-NEXT:    xscvspdpn f3, vs3
-; CHECK-P8-NEXT:    xxsldwi vs7, v2, v2, 3
-; CHECK-P8-NEXT:    xscvspdpn f9, vs9
-; CHECK-P8-NEXT:    xxswapd vs8, v2
-; CHECK-P8-NEXT:    xscvspdpn f0, v5
-; CHECK-P8-NEXT:    xxsldwi vs12, v2, v2, 1
-; CHECK-P8-NEXT:    xscvspdpn f2, v2
-; CHECK-P8-NEXT:    xxswapd v2, v4
-; CHECK-P8-NEXT:    xscvspdpn f5, vs5
-; CHECK-P8-NEXT:    xxsldwi v3, v4, v4, 1
-; CHECK-P8-NEXT:    xscvspdpn f10, vs10
-; CHECK-P8-NEXT:    xscvspdpn f11, vs11
-; CHECK-P8-NEXT:    xxmrghd vs1, vs3, vs1
-; CHECK-P8-NEXT:    xscvspdpn f7, vs7
-; CHECK-P8-NEXT:    xxmrghd vs4, vs4, vs9
-; CHECK-P8-NEXT:    xscvspdpn f8, vs8
-; CHECK-P8-NEXT:    xscvspdpn f12, vs12
-; CHECK-P8-NEXT:    xscvspdpn f13, vs13
-; CHECK-P8-NEXT:    xxmrghd vs0, vs0, vs5
-; CHECK-P8-NEXT:    xscvspdpn f3, v2
-; CHECK-P8-NEXT:    xscvspdpn f9, v3
-; CHECK-P8-NEXT:    xxmrghd vs5, vs11, vs10
-; CHECK-P8-NEXT:    xvcvdpuxds v3, vs4
-; CHECK-P8-NEXT:    xvcvdpuxds v2, vs1
-; CHECK-P8-NEXT:    xxmrghd vs1, vs2, vs12
-; CHECK-P8-NEXT:    xxmrghd vs2, vs8, vs7
-; CHECK-P8-NEXT:    xvcvdpuxds v4, vs0
-; CHECK-P8-NEXT:    xxmrghd vs0, vs3, vs13
+; CHECK-P8-NEXT:    xxmrglw vs2, v3, v3
+; CHECK-P8-NEXT:    xxmrghw vs4, v3, v3
+; CHECK-P8-NEXT:    xvcvspdp vs3, vs3
+; CHECK-P8-NEXT:    xxmrglw vs6, v2, v2
+; CHECK-P8-NEXT:    xxmrghw vs7, v2, v2
+; CHECK-P8-NEXT:    xvcvspdp vs5, vs5
+; CHECK-P8-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P8-NEXT:    xvcvspdp vs1, vs1
+; CHECK-P8-NEXT:    xvcvspdp vs2, vs2
+; CHECK-P8-NEXT:    xvcvspdp vs4, vs4
+; CHECK-P8-NEXT:    xvcvspdp vs6, vs6
+; CHECK-P8-NEXT:    xvcvspdp vs7, vs7
+; CHECK-P8-NEXT:    xvcvdpuxds v3, vs3
 ; CHECK-P8-NEXT:    xvcvdpuxds v5, vs5
-; CHECK-P8-NEXT:    xxmrghd vs3, vs6, vs9
-; CHECK-P8-NEXT:    xvcvdpuxds v0, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v2, vs0
+; CHECK-P8-NEXT:    xvcvdpuxds v4, vs1
+; CHECK-P8-NEXT:    xvcvdpuxds v0, vs4
 ; CHECK-P8-NEXT:    xvcvdpuxds v1, vs2
-; CHECK-P8-NEXT:    xvcvdpuxds v6, vs0
+; CHECK-P8-NEXT:    xvcvdpuxds v6, vs6
 ; CHECK-P8-NEXT:    xxswapd vs0, v3
-; CHECK-P8-NEXT:    xvcvdpuxds v7, vs3
-; CHECK-P8-NEXT:    xxswapd vs4, v2
-; CHECK-P8-NEXT:    xxswapd vs3, v4
+; CHECK-P8-NEXT:    xvcvdpuxds v7, vs7
 ; CHECK-P8-NEXT:    xxswapd vs1, v5
+; CHECK-P8-NEXT:    xxswapd vs4, v2
 ; CHECK-P8-NEXT:    stxvd2x vs0, r3, r4
 ; CHECK-P8-NEXT:    li r4, 96
+; CHECK-P8-NEXT:    xxswapd vs3, v4
 ; CHECK-P8-NEXT:    xxswapd vs2, v0
-; CHECK-P8-NEXT:    xxswapd vs0, v1
 ; CHECK-P8-NEXT:    stxvd2x vs1, r3, r4
-; CHECK-P8-NEXT:    xxswapd vs5, v6
 ; CHECK-P8-NEXT:    li r4, 80
+; CHECK-P8-NEXT:    xxswapd vs0, v1
+; CHECK-P8-NEXT:    xxswapd vs5, v6
 ; CHECK-P8-NEXT:    xxswapd vs1, v7
 ; CHECK-P8-NEXT:    stxvd2x vs2, r3, r4
 ; CHECK-P8-NEXT:    stxvd2x vs0, r3, r8
@@ -705,122 +523,82 @@ define void @test16elt_signed(<16 x i64>* noalias nocapture sret %agg.result, <1
 ;
 ; CHECK-P9-LABEL: test16elt_signed:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    lxv vs4, 16(r4)
-; CHECK-P9-NEXT:    xxsldwi vs5, vs4, vs4, 3
-; CHECK-P9-NEXT:    xxswapd vs6, vs4
-; CHECK-P9-NEXT:    lxv vs0, 0(r4)
-; CHECK-P9-NEXT:    xxsldwi vs1, vs0, vs0, 3
-; CHECK-P9-NEXT:    xxswapd vs2, vs0
-; CHECK-P9-NEXT:    xscvspdpn f5, vs5
-; CHECK-P9-NEXT:    xscvspdpn f6, vs6
-; CHECK-P9-NEXT:    xxmrghd vs5, vs6, vs5
-; CHECK-P9-NEXT:    xscvspdpn f6, vs4
-; CHECK-P9-NEXT:    xxsldwi vs4, vs4, vs4, 1
-; CHECK-P9-NEXT:    lxv vs3, 32(r4)
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
-; CHECK-P9-NEXT:    xxswapd vs7, vs3
-; CHECK-P9-NEXT:    xscvspdpn f7, vs7
-; CHECK-P9-NEXT:    xscvspdpn f4, vs4
-; CHECK-P9-NEXT:    xscvspdpn f1, vs1
-; CHECK-P9-NEXT:    xxmrghd vs1, vs2, vs1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs0
-; CHECK-P9-NEXT:    xxsldwi vs0, vs0, vs0, 1
-; CHECK-P9-NEXT:    xscvspdpn f0, vs0
-; CHECK-P9-NEXT:    xxmrghd vs0, vs2, vs0
-; CHECK-P9-NEXT:    xxmrghd vs4, vs6, vs4
-; CHECK-P9-NEXT:    xxsldwi vs6, vs3, vs3, 3
+; CHECK-P9-NEXT:    lxv vs0, 48(r4)
+; CHECK-P9-NEXT:    lxv vs1, 0(r4)
+; CHECK-P9-NEXT:    lxv vs3, 16(r4)
+; CHECK-P9-NEXT:    lxv vs5, 32(r4)
+; CHECK-P9-NEXT:    xxmrglw vs2, vs1, vs1
+; CHECK-P9-NEXT:    xxmrghw vs1, vs1, vs1
+; CHECK-P9-NEXT:    xxmrglw vs4, vs3, vs3
+; CHECK-P9-NEXT:    xxmrghw vs3, vs3, vs3
+; CHECK-P9-NEXT:    xxmrglw vs6, vs5, vs5
+; CHECK-P9-NEXT:    xxmrghw vs5, vs5, vs5
+; CHECK-P9-NEXT:    xxmrglw vs7, vs0, vs0
+; CHECK-P9-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-P9-NEXT:    xvcvspdp vs2, vs2
+; CHECK-P9-NEXT:    xvcvspdp vs1, vs1
+; CHECK-P9-NEXT:    xvcvspdp vs4, vs4
+; CHECK-P9-NEXT:    xvcvspdp vs3, vs3
+; CHECK-P9-NEXT:    xvcvspdp vs6, vs6
+; CHECK-P9-NEXT:    xvcvspdp vs5, vs5
+; CHECK-P9-NEXT:    xvcvspdp vs7, vs7
+; CHECK-P9-NEXT:    xvcvspdp vs0, vs0
+; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
 ; CHECK-P9-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-P9-NEXT:    xvcvdpuxds vs5, vs5
-; CHECK-P9-NEXT:    xscvspdpn f6, vs6
-; CHECK-P9-NEXT:    xxmrghd vs6, vs7, vs6
-; CHECK-P9-NEXT:    xscvspdpn f7, vs3
-; CHECK-P9-NEXT:    xxsldwi vs3, vs3, vs3, 1
-; CHECK-P9-NEXT:    lxv vs2, 48(r4)
-; CHECK-P9-NEXT:    xxswapd vs8, vs2
-; CHECK-P9-NEXT:    xscvspdpn f8, vs8
 ; CHECK-P9-NEXT:    xvcvdpuxds vs4, vs4
-; CHECK-P9-NEXT:    xscvspdpn f3, vs3
-; CHECK-P9-NEXT:    xxmrghd vs3, vs7, vs3
-; CHECK-P9-NEXT:    xxsldwi vs7, vs2, vs2, 3
-; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-P9-NEXT:    xvcvdpuxds vs6, vs6
-; CHECK-P9-NEXT:    stxv vs6, 64(r3)
-; CHECK-P9-NEXT:    xscvspdpn f7, vs7
-; CHECK-P9-NEXT:    xxmrghd vs7, vs8, vs7
-; CHECK-P9-NEXT:    xscvspdpn f8, vs2
-; CHECK-P9-NEXT:    xxsldwi vs2, vs2, vs2, 1
-; CHECK-P9-NEXT:    xscvspdpn f2, vs2
-; CHECK-P9-NEXT:    xxmrghd vs2, vs8, vs2
 ; CHECK-P9-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-P9-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-P9-NEXT:    xvcvdpuxds vs5, vs5
 ; CHECK-P9-NEXT:    xvcvdpuxds vs7, vs7
-; CHECK-P9-NEXT:    stxv vs3, 80(r3)
-; CHECK-P9-NEXT:    xvcvdpuxds vs2, vs2
+; CHECK-P9-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-P9-NEXT:    stxv vs0, 112(r3)
 ; CHECK-P9-NEXT:    stxv vs7, 96(r3)
-; CHECK-P9-NEXT:    stxv vs2, 112(r3)
-; CHECK-P9-NEXT:    stxv vs4, 48(r3)
-; CHECK-P9-NEXT:    stxv vs5, 32(r3)
-; CHECK-P9-NEXT:    stxv vs0, 16(r3)
-; CHECK-P9-NEXT:    stxv vs1, 0(r3)
+; CHECK-P9-NEXT:    stxv vs5, 80(r3)
+; CHECK-P9-NEXT:    stxv vs6, 64(r3)
+; CHECK-P9-NEXT:    stxv vs3, 48(r3)
+; CHECK-P9-NEXT:    stxv vs4, 32(r3)
+; CHECK-P9-NEXT:    stxv vs1, 16(r3)
+; CHECK-P9-NEXT:    stxv vs2, 0(r3)
 ; CHECK-P9-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test16elt_signed:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxv vs0, 0(r4)
-; CHECK-BE-NEXT:    lxv vs4, 16(r4)
-; CHECK-BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
-; CHECK-BE-NEXT:    xscvspdpn f1, vs0
-; CHECK-BE-NEXT:    xxsldwi vs5, vs0, vs0, 3
-; CHECK-BE-NEXT:    xxswapd vs0, vs0
-; CHECK-BE-NEXT:    xscvspdpn f5, vs5
-; CHECK-BE-NEXT:    xscvspdpn f0, vs0
-; CHECK-BE-NEXT:    xxsldwi vs6, vs4, vs4, 1
-; CHECK-BE-NEXT:    xscvspdpn f6, vs6
-; CHECK-BE-NEXT:    xxmrghd vs0, vs0, vs5
-; CHECK-BE-NEXT:    xscvspdpn f5, vs4
-; CHECK-BE-NEXT:    lxv vs3, 32(r4)
-; CHECK-BE-NEXT:    xxsldwi vs7, vs3, vs3, 1
-; CHECK-BE-NEXT:    xscvspdpn f7, vs7
-; CHECK-BE-NEXT:    xxmrghd vs5, vs5, vs6
-; CHECK-BE-NEXT:    xxsldwi vs6, vs4, vs4, 3
-; CHECK-BE-NEXT:    xxswapd vs4, vs4
-; CHECK-BE-NEXT:    xscvspdpn f6, vs6
-; CHECK-BE-NEXT:    xscvspdpn f4, vs4
-; CHECK-BE-NEXT:    xscvspdpn f2, vs2
-; CHECK-BE-NEXT:    xxmrghd vs1, vs1, vs2
-; CHECK-BE-NEXT:    lxv vs2, 48(r4)
-; CHECK-BE-NEXT:    xxsldwi vs8, vs2, vs2, 1
+; CHECK-BE-NEXT:    lxv vs0, 48(r4)
+; CHECK-BE-NEXT:    lxv vs1, 0(r4)
+; CHECK-BE-NEXT:    lxv vs3, 16(r4)
+; CHECK-BE-NEXT:    lxv vs5, 32(r4)
+; CHECK-BE-NEXT:    xxmrghw vs2, vs1, vs1
+; CHECK-BE-NEXT:    xxmrglw vs1, vs1, vs1
+; CHECK-BE-NEXT:    xxmrghw vs4, vs3, vs3
+; CHECK-BE-NEXT:    xxmrglw vs3, vs3, vs3
+; CHECK-BE-NEXT:    xxmrghw vs6, vs5, vs5
+; CHECK-BE-NEXT:    xxmrglw vs5, vs5, vs5
+; CHECK-BE-NEXT:    xxmrghw vs7, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw vs0, vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs2, vs2
+; CHECK-BE-NEXT:    xvcvspdp vs1, vs1
+; CHECK-BE-NEXT:    xvcvspdp vs4, vs4
+; CHECK-BE-NEXT:    xvcvspdp vs3, vs3
+; CHECK-BE-NEXT:    xvcvspdp vs6, vs6
+; CHECK-BE-NEXT:    xvcvspdp vs5, vs5
+; CHECK-BE-NEXT:    xvcvspdp vs7, vs7
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
+; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
 ; CHECK-BE-NEXT:    xvcvdpuxds vs1, vs1
-; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
-; CHECK-BE-NEXT:    xvcvdpuxds vs5, vs5
-; CHECK-BE-NEXT:    xscvspdpn f8, vs8
-; CHECK-BE-NEXT:    xxmrghd vs4, vs4, vs6
-; CHECK-BE-NEXT:    xscvspdpn f6, vs3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
-; CHECK-BE-NEXT:    xxmrghd vs6, vs6, vs7
-; CHECK-BE-NEXT:    xxsldwi vs7, vs3, vs3, 3
-; CHECK-BE-NEXT:    xxswapd vs3, vs3
-; CHECK-BE-NEXT:    xscvspdpn f7, vs7
-; CHECK-BE-NEXT:    xscvspdpn f3, vs3
-; CHECK-BE-NEXT:    xxmrghd vs3, vs3, vs7
-; CHECK-BE-NEXT:    xscvspdpn f7, vs2
-; CHECK-BE-NEXT:    xxmrghd vs7, vs7, vs8
-; CHECK-BE-NEXT:    xxsldwi vs8, vs2, vs2, 3
-; CHECK-BE-NEXT:    xxswapd vs2, vs2
-; CHECK-BE-NEXT:    xscvspdpn f8, vs8
-; CHECK-BE-NEXT:    xscvspdpn f2, vs2
-; CHECK-BE-NEXT:    xxmrghd vs2, vs2, vs8
-; CHECK-BE-NEXT:    stxv vs5, 32(r3)
 ; CHECK-BE-NEXT:    xvcvdpuxds vs4, vs4
-; CHECK-BE-NEXT:    xvcvdpuxds vs6, vs6
 ; CHECK-BE-NEXT:    xvcvdpuxds vs3, vs3
+; CHECK-BE-NEXT:    xvcvdpuxds vs6, vs6
+; CHECK-BE-NEXT:    xvcvdpuxds vs5, vs5
 ; CHECK-BE-NEXT:    xvcvdpuxds vs7, vs7
-; CHECK-BE-NEXT:    stxv vs3, 80(r3)
+; CHECK-BE-NEXT:    xvcvdpuxds vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 112(r3)
 ; CHECK-BE-NEXT:    stxv vs7, 96(r3)
-; CHECK-BE-NEXT:    xvcvdpuxds vs2, vs2
-; CHECK-BE-NEXT:    stxv vs2, 112(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
 ; CHECK-BE-NEXT:    stxv vs6, 64(r3)
-; CHECK-BE-NEXT:    stxv vs4, 48(r3)
-; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    stxv vs4, 32(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    stxv vs2, 0(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %a = load <16 x float>, <16 x float>* %0, align 64

diff  --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index 9923cb02cc8d..7cf43a92a5dc 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -1554,11 +1554,8 @@ define <2 x i64> @test46(<2 x float> %a) {
 ;
 ; CHECK-LE-LABEL: test46:
 ; CHECK-LE:       # %bb.0:
-; CHECK-LE-NEXT:    xxsldwi vs0, v2, v2, 3
-; CHECK-LE-NEXT:    xxswapd vs1, v2
-; CHECK-LE-NEXT:    xscvspdpn f0, vs0
-; CHECK-LE-NEXT:    xscvspdpn f1, vs1
-; CHECK-LE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-LE-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-LE-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-LE-NEXT:    xvcvdpuxds v2, vs0
 ; CHECK-LE-NEXT:    blr
   %v = fptoui <2 x float> %a to <2 x i64>
@@ -1625,11 +1622,8 @@ define <2 x i64> @test47(<2 x float> %a) {
 ;
 ; CHECK-LE-LABEL: test47:
 ; CHECK-LE:       # %bb.0:
-; CHECK-LE-NEXT:    xxsldwi vs0, v2, v2, 3
-; CHECK-LE-NEXT:    xxswapd vs1, v2
-; CHECK-LE-NEXT:    xscvspdpn f0, vs0
-; CHECK-LE-NEXT:    xscvspdpn f1, vs1
-; CHECK-LE-NEXT:    xxmrghd vs0, vs1, vs0
+; CHECK-LE-NEXT:    xxmrglw vs0, v2, v2
+; CHECK-LE-NEXT:    xvcvspdp vs0, vs0
 ; CHECK-LE-NEXT:    xvcvdpsxds v2, vs0
 ; CHECK-LE-NEXT:    blr
   %v = fptosi <2 x float> %a to <2 x i64>


        


More information about the llvm-commits mailing list