[llvm] 300e129 - [PowerPC] Disable perfect shuffle by default
Qiu Chaofan via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 15 01:01:59 PDT 2022
Author: Qiu Chaofan
Date: 2022-03-15T15:52:24+08:00
New Revision: 300e1293de635adbe651030de5c8ebd3263458b2
URL: https://github.com/llvm/llvm-project/commit/300e1293de635adbe651030de5c8ebd3263458b2
DIFF: https://github.com/llvm/llvm-project/commit/300e1293de635adbe651030de5c8ebd3263458b2.diff
LOG: [PowerPC] Disable perfect shuffle by default
We are going to remove the old 'perfect shuffle' optimization since it
causes a performance penalty in hot loops over vectors. For example, in
the following loop, where both shuffles share the same mask:
%v.1 = shufflevector ... <0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27>
%v.2 = shufflevector ... <0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27>
the generated instructions will be `vmrglw-vmrghw-vmrglw-vmrghw` instead
of `vperm-vperm`. In some large loops this causes a performance penalty
of more than 20%.
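As an illustration only (this function is made up for this message, not part
of the patch), a function with two shuffles reusing that mask would look like:

  define <16 x i8> @two_shuffles(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
    ; Both shuffles take bytes 0-3 and 8-11 from each of their two sources.
    %v.1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
    %v.2 = shufflevector <16 x i8> %c, <16 x i8> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
    %sum = add <16 x i8> %v.1, %v.2
    ret <16 x i8> %sum
  }

With perfect shuffle disabled, each shuffle lowers to one vperm and the shared
permute-control vector is loaded from the constant pool once; with perfect
shuffle enabled, each shuffle expands into its own vmrglw/vmrghw sequence (see
the shuffle3 test in perfect-shuffle.ll below).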
The original attempt to resolve this was to pre-record the mask of every
shufflevector operation in the DAG, but that is somewhat complex and adds
unnecessary computation (scanning all nodes) to the optimization. Here we
disable perfect shuffle by default instead. Some cases do become worse
after this change; they will be fixed in a more careful way in future
patches.
Reviewed By: jsji
Differential Revision: https://reviews.llvm.org/D121082
Added:
Modified:
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll
llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll
llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
llvm/test/CodeGen/PowerPC/extract-and-store.ll
llvm/test/CodeGen/PowerPC/load-and-splat.ll
llvm/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
llvm/test/CodeGen/PowerPC/perfect-shuffle.ll
llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll
llvm/test/CodeGen/PowerPC/pr27078.ll
llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll
llvm/test/CodeGen/PowerPC/test-vector-insert.ll
llvm/test/CodeGen/PowerPC/vec_extract_p9.ll
llvm/test/CodeGen/PowerPC/vec_perf_shuffle.ll
llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1d40a64ddaf0b..885b474a72376 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -129,7 +129,7 @@ static cl::opt<bool> EnableQuadwordAtomics(
static cl::opt<bool>
DisablePerfectShuffle("ppc-disable-perfect-shuffle",
cl::desc("disable vector permute decomposition"),
- cl::init(false), cl::Hidden);
+ cl::init(true), cl::Hidden);
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
diff --git a/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll b/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll
index 4f592109e706b..a5d4ce9ef247f 100644
--- a/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll
+++ b/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll
@@ -1,5 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | grep vsldoi
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | not grep vor
+; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | grep vsldoi
+; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | not grep vor
+
+; TODO: Fix this case when disabling perfect shuffle
define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) {
%tmp76 = shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < i32 0, i32 1, i32 2, i32 7 > ; <<4 x float>> [#uses=1]
diff --git a/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll b/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll
index 8586eb351ceff..c2ff6a6c8ab75 100644
--- a/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll
@@ -1447,16 +1447,16 @@ entry:
define <4 x float> @testSameVecEl0LE(<4 x float> %a) {
; CHECK-64-LABEL: testSameVecEl0LE:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: xxspltw 0, 34, 2
-; CHECK-64-NEXT: xxsldwi 0, 34, 0, 1
-; CHECK-64-NEXT: xxsldwi 34, 0, 0, 3
+; CHECK-64-NEXT: ld 3, L..C0(2) # %const.0
+; CHECK-64-NEXT: lxv 35, 0(3)
+; CHECK-64-NEXT: vperm 2, 2, 2, 3
; CHECK-64-NEXT: blr
;
; CHECK-32-LABEL: testSameVecEl0LE:
; CHECK-32: # %bb.0: # %entry
-; CHECK-32-NEXT: xxspltw 0, 34, 2
-; CHECK-32-NEXT: xxsldwi 0, 34, 0, 1
-; CHECK-32-NEXT: xxsldwi 34, 0, 0, 3
+; CHECK-32-NEXT: lwz 3, L..C0(2) # %const.0
+; CHECK-32-NEXT: lxv 35, 0(3)
+; CHECK-32-NEXT: vperm 2, 2, 2, 3
; CHECK-32-NEXT: blr
entry:
%vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
@@ -1465,16 +1465,16 @@ entry:
define <4 x float> @testSameVecEl1LE(<4 x float> %a) {
; CHECK-64-LABEL: testSameVecEl1LE:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: xxswapd 0, 34
-; CHECK-64-NEXT: xxmrghw 1, 34, 0
-; CHECK-64-NEXT: xxmrghw 34, 1, 0
+; CHECK-64-NEXT: ld 3, L..C1(2) # %const.0
+; CHECK-64-NEXT: lxv 35, 0(3)
+; CHECK-64-NEXT: vperm 2, 2, 2, 3
; CHECK-64-NEXT: blr
;
; CHECK-32-LABEL: testSameVecEl1LE:
; CHECK-32: # %bb.0: # %entry
-; CHECK-32-NEXT: xxswapd 0, 34
-; CHECK-32-NEXT: xxmrghw 1, 34, 0
-; CHECK-32-NEXT: xxmrghw 34, 1, 0
+; CHECK-32-NEXT: lwz 3, L..C1(2) # %const.0
+; CHECK-32-NEXT: lxv 35, 0(3)
+; CHECK-32-NEXT: vperm 2, 2, 2, 3
; CHECK-32-NEXT: blr
entry:
%vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
@@ -1483,16 +1483,16 @@ entry:
define <4 x float> @testSameVecEl3LE(<4 x float> %a) {
; CHECK-64-LABEL: testSameVecEl3LE:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: xxspltw 0, 34, 2
-; CHECK-64-NEXT: xxswapd 1, 34
-; CHECK-64-NEXT: xxsldwi 34, 1, 0, 2
+; CHECK-64-NEXT: ld 3, L..C2(2) # %const.0
+; CHECK-64-NEXT: lxv 35, 0(3)
+; CHECK-64-NEXT: vperm 2, 2, 2, 3
; CHECK-64-NEXT: blr
;
; CHECK-32-LABEL: testSameVecEl3LE:
; CHECK-32: # %bb.0: # %entry
-; CHECK-32-NEXT: xxspltw 0, 34, 2
-; CHECK-32-NEXT: xxswapd 1, 34
-; CHECK-32-NEXT: xxsldwi 34, 1, 0, 2
+; CHECK-32-NEXT: lwz 3, L..C2(2) # %const.0
+; CHECK-32-NEXT: lxv 35, 0(3)
+; CHECK-32-NEXT: vperm 2, 2, 2, 3
; CHECK-32-NEXT: blr
entry:
%vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
diff --git a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
index 8c9e7da82af61..055ff8af07024 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
@@ -30,12 +30,13 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-AIX-NEXT: lxvw4x 35, 0, 3
; CHECK-AIX-NEXT: addi 3, 1, -16
; CHECK-AIX-NEXT: lxvw4x 36, 0, 3
+; CHECK-AIX-NEXT: ld 3, L..C0(2) # %const.0
; CHECK-AIX-NEXT: vmrghh 3, 2, 3
-; CHECK-AIX-NEXT: vsplth 5, 2, 0
-; CHECK-AIX-NEXT: vmrghh 2, 4, 2
-; CHECK-AIX-NEXT: xxmrghw 35, 35, 37
-; CHECK-AIX-NEXT: xxswapd 0, 35
-; CHECK-AIX-NEXT: xxsldwi 34, 0, 34, 2
+; CHECK-AIX-NEXT: vmrghh 4, 4, 2
+; CHECK-AIX-NEXT: vsplth 2, 2, 0
+; CHECK-AIX-NEXT: xxmrghw 34, 35, 34
+; CHECK-AIX-NEXT: lxvw4x 35, 0, 3
+; CHECK-AIX-NEXT: vperm 2, 2, 4, 3
; CHECK-AIX-NEXT: vsplth 3, 2, 1
; CHECK-AIX-NEXT: vsplth 2, 2, 4
; CHECK-AIX-NEXT: stxvw4x 35, 0, 5
diff --git a/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
index 92e888a996520..f1795332419d9 100644
--- a/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
+++ b/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
@@ -55,15 +55,15 @@ entry:
define <2 x i64> @buildl(i64 %a) {
; CHECK-LABEL: buildl:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lwz 5, L..C0(2) # %const.0
; CHECK-NEXT: stw 4, -16(1)
; CHECK-NEXT: stw 3, -32(1)
; CHECK-NEXT: addi 3, 1, -16
; CHECK-NEXT: addi 4, 1, -32
-; CHECK-NEXT: lxvw4x 0, 0, 3
-; CHECK-NEXT: lxvw4x 1, 0, 4
-; CHECK-NEXT: xxmrghw 34, 1, 0
-; CHECK-NEXT: xxswapd 0, 34
-; CHECK-NEXT: xxsldwi 34, 0, 34, 2
+; CHECK-NEXT: lxvw4x 35, 0, 3
+; CHECK-NEXT: lxvw4x 36, 0, 4
+; CHECK-NEXT: lxvw4x 34, 0, 5
+; CHECK-NEXT: vperm 2, 4, 3, 2
; CHECK-NEXT: blr
entry:
%splat.splatinsert = insertelement <2 x i64> undef, i64 %a, i32 0
@@ -90,7 +90,7 @@ entry:
define <2 x double> @buildd() {
; CHECK-LABEL: buildd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lwz 3, L..C0(2) # @d
+; CHECK-NEXT: lwz 3, L..C1(2) # @d
; CHECK-NEXT: lxvdsx 34, 0, 3
; CHECK-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/extract-and-store.ll b/llvm/test/CodeGen/PowerPC/extract-and-store.ll
index d03a35fd72805..ae680115abc65 100644
--- a/llvm/test/CodeGen/PowerPC/extract-and-store.ll
+++ b/llvm/test/CodeGen/PowerPC/extract-and-store.ll
@@ -584,14 +584,16 @@ define dso_local void @test_stores_exceed_vec_size(<4 x i32> %a, i32* nocapture
;
; CHECK-BE-LABEL: test_stores_exceed_vec_size:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: xxspltw vs0, vs34, 0
-; CHECK-BE-NEXT: xxsldwi vs1, vs34, vs34, 1
-; CHECK-BE-NEXT: li r3, 16
+; CHECK-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha
+; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-BE-NEXT: li r4, 20
+; CHECK-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l
+; CHECK-BE-NEXT: lxvw4x vs35, 0, r3
+; CHECK-BE-NEXT: li r3, 16
; CHECK-BE-NEXT: stxsiwx vs34, r5, r3
-; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs0, 2
-; CHECK-BE-NEXT: stfiwx f1, r5, r4
-; CHECK-BE-NEXT: stxvw4x vs0, 0, r5
+; CHECK-BE-NEXT: stfiwx f0, r5, r4
+; CHECK-BE-NEXT: vperm v3, v2, v2, v3
+; CHECK-BE-NEXT: stxvw4x vs35, 0, r5
; CHECK-BE-NEXT: blr
;
; CHECK-P9-LABEL: test_stores_exceed_vec_size:
@@ -610,14 +612,16 @@ define dso_local void @test_stores_exceed_vec_size(<4 x i32> %a, i32* nocapture
;
; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size:
; CHECK-P9-BE: # %bb.0: # %entry
-; CHECK-P9-BE-NEXT: xxspltw vs0, vs34, 0
+; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha
+; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
+; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l
+; CHECK-P9-BE-NEXT: lxv vs35, 0(r3)
; CHECK-P9-BE-NEXT: li r3, 16
; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3
; CHECK-P9-BE-NEXT: li r3, 20
-; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2
-; CHECK-P9-BE-NEXT: stxv vs0, 0(r5)
-; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3
+; CHECK-P9-BE-NEXT: vperm v3, v2, v2, v3
+; CHECK-P9-BE-NEXT: stxv vs35, 0(r5)
; CHECK-P9-BE-NEXT: blr
entry:
%vecext = extractelement <4 x i32> %a, i32 2
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index 88dcfe3b78fe1..91a19f58bb8ec 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -211,45 +211,45 @@ define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a
; P9-AIX32-NEXT: lwz r5, 24(r4)
; P9-AIX32-NEXT: lwz r4, 28(r4)
; P9-AIX32-NEXT: stw r4, -16(r1)
+; P9-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0
; P9-AIX32-NEXT: stw r5, -32(r1)
-; P9-AIX32-NEXT: lxv vs0, -16(r1)
-; P9-AIX32-NEXT: lxv vs1, -32(r1)
-; P9-AIX32-NEXT: xxmrghw v2, vs1, vs0
-; P9-AIX32-NEXT: xxswapd vs0, v2
-; P9-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2
-; P9-AIX32-NEXT: stxv vs0, 0(r3)
+; P9-AIX32-NEXT: lxv v3, -16(r1)
+; P9-AIX32-NEXT: lxv v4, -32(r1)
+; P9-AIX32-NEXT: lxv v2, 0(r4)
+; P9-AIX32-NEXT: vperm v2, v4, v3, v2
+; P9-AIX32-NEXT: stxv v2, 0(r3)
; P9-AIX32-NEXT: blr
;
; P8-AIX32-LABEL: test4:
; P8-AIX32: # %bb.0: # %entry
-; P8-AIX32-NEXT: lwz r5, 24(r4)
-; P8-AIX32-NEXT: lwz r4, 28(r4)
-; P8-AIX32-NEXT: stw r4, -16(r1)
-; P8-AIX32-NEXT: stw r5, -32(r1)
+; P8-AIX32-NEXT: lwz r5, L..C0(r2) # %const.0
+; P8-AIX32-NEXT: lwz r6, 28(r4)
+; P8-AIX32-NEXT: lwz r4, 24(r4)
+; P8-AIX32-NEXT: stw r6, -16(r1)
+; P8-AIX32-NEXT: stw r4, -32(r1)
; P8-AIX32-NEXT: addi r4, r1, -16
+; P8-AIX32-NEXT: lxvw4x v2, 0, r5
; P8-AIX32-NEXT: addi r5, r1, -32
-; P8-AIX32-NEXT: lxvw4x vs0, 0, r4
-; P8-AIX32-NEXT: lxvw4x vs1, 0, r5
-; P8-AIX32-NEXT: xxmrghw v2, vs1, vs0
-; P8-AIX32-NEXT: xxswapd vs0, v2
-; P8-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2
-; P8-AIX32-NEXT: stxvw4x vs0, 0, r3
+; P8-AIX32-NEXT: lxvw4x v3, 0, r4
+; P8-AIX32-NEXT: lxvw4x v4, 0, r5
+; P8-AIX32-NEXT: vperm v2, v4, v3, v2
+; P8-AIX32-NEXT: stxvw4x v2, 0, r3
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: test4:
; P7-AIX32: # %bb.0: # %entry
-; P7-AIX32-NEXT: lwz r6, 28(r4)
-; P7-AIX32-NEXT: lwz r4, 24(r4)
-; P7-AIX32-NEXT: addi r5, r1, -16
-; P7-AIX32-NEXT: stw r6, -16(r1)
-; P7-AIX32-NEXT: stw r4, -32(r1)
-; P7-AIX32-NEXT: addi r4, r1, -32
-; P7-AIX32-NEXT: lxvw4x vs0, 0, r5
-; P7-AIX32-NEXT: lxvw4x vs1, 0, r4
-; P7-AIX32-NEXT: xxmrghw v2, vs1, vs0
-; P7-AIX32-NEXT: xxswapd vs0, v2
-; P7-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2
-; P7-AIX32-NEXT: stxvw4x vs0, 0, r3
+; P7-AIX32-NEXT: lwz r5, L..C0(r2) # %const.0
+; P7-AIX32-NEXT: lwz r6, 24(r4)
+; P7-AIX32-NEXT: lwz r4, 28(r4)
+; P7-AIX32-NEXT: stw r4, -16(r1)
+; P7-AIX32-NEXT: stw r6, -32(r1)
+; P7-AIX32-NEXT: lxvw4x v2, 0, r5
+; P7-AIX32-NEXT: addi r4, r1, -16
+; P7-AIX32-NEXT: addi r5, r1, -32
+; P7-AIX32-NEXT: lxvw4x v3, 0, r4
+; P7-AIX32-NEXT: lxvw4x v4, 0, r5
+; P7-AIX32-NEXT: vperm v2, v4, v3, v2
+; P7-AIX32-NEXT: stxvw4x v2, 0, r3
; P7-AIX32-NEXT: blr
entry:
%arrayidx = getelementptr inbounds i64, i64* %a, i64 3
@@ -288,45 +288,45 @@ define void @test5(<2 x i64>* %a, i32* %in) {
; P9-AIX32-NEXT: lwz r4, 0(r4)
; P9-AIX32-NEXT: srawi r5, r4, 31
; P9-AIX32-NEXT: stw r4, -16(r1)
-; P9-AIX32-NEXT: lxv vs0, -16(r1)
+; P9-AIX32-NEXT: lwz r4, L..C1(r2) # %const.0
+; P9-AIX32-NEXT: lxv v3, -16(r1)
; P9-AIX32-NEXT: stw r5, -32(r1)
-; P9-AIX32-NEXT: lxv vs1, -32(r1)
-; P9-AIX32-NEXT: xxmrghw v2, vs1, vs0
-; P9-AIX32-NEXT: xxswapd vs0, v2
-; P9-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2
-; P9-AIX32-NEXT: stxv vs0, 0(r3)
+; P9-AIX32-NEXT: lxv v4, -32(r1)
+; P9-AIX32-NEXT: lxv v2, 0(r4)
+; P9-AIX32-NEXT: vperm v2, v4, v3, v2
+; P9-AIX32-NEXT: stxv v2, 0(r3)
; P9-AIX32-NEXT: blr
;
; P8-AIX32-LABEL: test5:
; P8-AIX32: # %bb.0: # %entry
+; P8-AIX32-NEXT: lwz r5, L..C1(r2) # %const.0
; P8-AIX32-NEXT: lwz r4, 0(r4)
-; P8-AIX32-NEXT: srawi r5, r4, 31
; P8-AIX32-NEXT: stw r4, -16(r1)
+; P8-AIX32-NEXT: srawi r4, r4, 31
+; P8-AIX32-NEXT: stw r4, -32(r1)
+; P8-AIX32-NEXT: lxvw4x v2, 0, r5
; P8-AIX32-NEXT: addi r4, r1, -16
-; P8-AIX32-NEXT: stw r5, -32(r1)
; P8-AIX32-NEXT: addi r5, r1, -32
-; P8-AIX32-NEXT: lxvw4x vs0, 0, r4
-; P8-AIX32-NEXT: lxvw4x vs1, 0, r5
-; P8-AIX32-NEXT: xxmrghw v2, vs1, vs0
-; P8-AIX32-NEXT: xxswapd vs0, v2
-; P8-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2
-; P8-AIX32-NEXT: stxvw4x vs0, 0, r3
+; P8-AIX32-NEXT: lxvw4x v3, 0, r4
+; P8-AIX32-NEXT: lxvw4x v4, 0, r5
+; P8-AIX32-NEXT: vperm v2, v4, v3, v2
+; P8-AIX32-NEXT: stxvw4x v2, 0, r3
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: test5:
; P7-AIX32: # %bb.0: # %entry
; P7-AIX32-NEXT: lwz r4, 0(r4)
-; P7-AIX32-NEXT: addi r5, r1, -16
+; P7-AIX32-NEXT: lwz r5, L..C1(r2) # %const.0
+; P7-AIX32-NEXT: srawi r6, r4, 31
; P7-AIX32-NEXT: stw r4, -16(r1)
-; P7-AIX32-NEXT: srawi r4, r4, 31
-; P7-AIX32-NEXT: stw r4, -32(r1)
-; P7-AIX32-NEXT: addi r4, r1, -32
-; P7-AIX32-NEXT: lxvw4x vs0, 0, r5
-; P7-AIX32-NEXT: lxvw4x vs1, 0, r4
-; P7-AIX32-NEXT: xxmrghw v2, vs1, vs0
-; P7-AIX32-NEXT: xxswapd vs0, v2
-; P7-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2
-; P7-AIX32-NEXT: stxvw4x vs0, 0, r3
+; P7-AIX32-NEXT: addi r4, r1, -16
+; P7-AIX32-NEXT: stw r6, -32(r1)
+; P7-AIX32-NEXT: lxvw4x v2, 0, r5
+; P7-AIX32-NEXT: addi r5, r1, -32
+; P7-AIX32-NEXT: lxvw4x v3, 0, r4
+; P7-AIX32-NEXT: lxvw4x v4, 0, r5
+; P7-AIX32-NEXT: vperm v2, v4, v3, v2
+; P7-AIX32-NEXT: stxvw4x v2, 0, r3
; P7-AIX32-NEXT: blr
entry:
%0 = load i32, i32* %in, align 4
@@ -365,45 +365,45 @@ define void @test6(<2 x i64>* %a, i32* %in) {
; P9-AIX32-NEXT: lwz r4, 0(r4)
; P9-AIX32-NEXT: li r5, 0
; P9-AIX32-NEXT: stw r5, -32(r1)
-; P9-AIX32-NEXT: lxv vs0, -32(r1)
+; P9-AIX32-NEXT: lxv v3, -32(r1)
; P9-AIX32-NEXT: stw r4, -16(r1)
-; P9-AIX32-NEXT: lxv vs1, -16(r1)
-; P9-AIX32-NEXT: xxmrghw v2, vs0, vs1
-; P9-AIX32-NEXT: xxswapd vs0, v2
-; P9-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2
-; P9-AIX32-NEXT: stxv vs0, 0(r3)
+; P9-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0
+; P9-AIX32-NEXT: lxv v4, -16(r1)
+; P9-AIX32-NEXT: lxv v2, 0(r4)
+; P9-AIX32-NEXT: vperm v2, v3, v4, v2
+; P9-AIX32-NEXT: stxv v2, 0(r3)
; P9-AIX32-NEXT: blr
;
; P8-AIX32-LABEL: test6:
; P8-AIX32: # %bb.0: # %entry
+; P8-AIX32-NEXT: lwz r6, L..C2(r2) # %const.0
; P8-AIX32-NEXT: lwz r4, 0(r4)
; P8-AIX32-NEXT: li r5, 0
; P8-AIX32-NEXT: stw r5, -32(r1)
; P8-AIX32-NEXT: addi r5, r1, -16
; P8-AIX32-NEXT: stw r4, -16(r1)
; P8-AIX32-NEXT: addi r4, r1, -32
-; P8-AIX32-NEXT: lxvw4x vs0, 0, r4
-; P8-AIX32-NEXT: lxvw4x vs1, 0, r5
-; P8-AIX32-NEXT: xxmrghw v2, vs0, vs1
-; P8-AIX32-NEXT: xxswapd vs0, v2
-; P8-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2
-; P8-AIX32-NEXT: stxvw4x vs0, 0, r3
+; P8-AIX32-NEXT: lxvw4x v2, 0, r6
+; P8-AIX32-NEXT: lxvw4x v3, 0, r4
+; P8-AIX32-NEXT: lxvw4x v4, 0, r5
+; P8-AIX32-NEXT: vperm v2, v3, v4, v2
+; P8-AIX32-NEXT: stxvw4x v2, 0, r3
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: test6:
; P7-AIX32: # %bb.0: # %entry
+; P7-AIX32-NEXT: lwz r5, L..C2(r2) # %const.0
; P7-AIX32-NEXT: lwz r4, 0(r4)
-; P7-AIX32-NEXT: li r5, 0
-; P7-AIX32-NEXT: stw r5, -32(r1)
-; P7-AIX32-NEXT: addi r5, r1, -16
+; P7-AIX32-NEXT: li r6, 0
+; P7-AIX32-NEXT: stw r6, -32(r1)
; P7-AIX32-NEXT: stw r4, -16(r1)
; P7-AIX32-NEXT: addi r4, r1, -32
-; P7-AIX32-NEXT: lxvw4x vs0, 0, r4
-; P7-AIX32-NEXT: lxvw4x vs1, 0, r5
-; P7-AIX32-NEXT: xxmrghw v2, vs0, vs1
-; P7-AIX32-NEXT: xxswapd vs0, v2
-; P7-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2
-; P7-AIX32-NEXT: stxvw4x vs0, 0, r3
+; P7-AIX32-NEXT: lxvw4x v2, 0, r5
+; P7-AIX32-NEXT: addi r5, r1, -16
+; P7-AIX32-NEXT: lxvw4x v3, 0, r4
+; P7-AIX32-NEXT: lxvw4x v4, 0, r5
+; P7-AIX32-NEXT: vperm v2, v3, v4, v2
+; P7-AIX32-NEXT: stxvw4x v2, 0, r3
; P7-AIX32-NEXT: blr
entry:
%0 = load i32, i32* %in, align 4
@@ -832,32 +832,34 @@ define <16 x i8> @unadjusted_lxvdsx(i64* %s, i64* %t) {
;
; P8-AIX32-LABEL: unadjusted_lxvdsx:
; P8-AIX32: # %bb.0: # %entry
-; P8-AIX32-NEXT: lwz r4, 4(r3)
-; P8-AIX32-NEXT: stw r4, -32(r1)
-; P8-AIX32-NEXT: addi r4, r1, -16
+; P8-AIX32-NEXT: lwz r5, 4(r3)
+; P8-AIX32-NEXT: lwz r4, L..C3(r2) # %const.0
+; P8-AIX32-NEXT: stw r5, -32(r1)
; P8-AIX32-NEXT: lwz r3, 0(r3)
+; P8-AIX32-NEXT: lxvw4x v2, 0, r4
+; P8-AIX32-NEXT: addi r4, r1, -16
; P8-AIX32-NEXT: stw r3, -16(r1)
; P8-AIX32-NEXT: addi r3, r1, -32
-; P8-AIX32-NEXT: lxvw4x vs0, 0, r3
-; P8-AIX32-NEXT: lxvw4x vs1, 0, r4
-; P8-AIX32-NEXT: xxmrghw v2, vs1, vs0
-; P8-AIX32-NEXT: xxsldwi vs0, vs1, v2, 2
-; P8-AIX32-NEXT: xxmrgld v2, vs0, vs0
+; P8-AIX32-NEXT: lxvw4x v3, 0, r3
+; P8-AIX32-NEXT: lxvw4x v4, 0, r4
+; P8-AIX32-NEXT: vperm v2, v4, v3, v2
+; P8-AIX32-NEXT: xxmrghd v2, v2, v2
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: unadjusted_lxvdsx:
; P7-AIX32: # %bb.0: # %entry
; P7-AIX32-NEXT: lwz r5, 4(r3)
-; P7-AIX32-NEXT: addi r4, r1, -32
+; P7-AIX32-NEXT: lwz r4, L..C3(r2) # %const.0
; P7-AIX32-NEXT: stw r5, -32(r1)
; P7-AIX32-NEXT: lwz r3, 0(r3)
+; P7-AIX32-NEXT: lxvw4x v2, 0, r4
+; P7-AIX32-NEXT: addi r4, r1, -16
; P7-AIX32-NEXT: stw r3, -16(r1)
-; P7-AIX32-NEXT: addi r3, r1, -16
-; P7-AIX32-NEXT: lxvw4x vs0, 0, r4
-; P7-AIX32-NEXT: lxvw4x vs1, 0, r3
-; P7-AIX32-NEXT: xxmrghw v2, vs1, vs0
-; P7-AIX32-NEXT: xxsldwi vs0, vs1, v2, 2
-; P7-AIX32-NEXT: xxmrgld v2, vs0, vs0
+; P7-AIX32-NEXT: addi r3, r1, -32
+; P7-AIX32-NEXT: lxvw4x v3, 0, r3
+; P7-AIX32-NEXT: lxvw4x v4, 0, r4
+; P7-AIX32-NEXT: vperm v2, v4, v3, v2
+; P7-AIX32-NEXT: xxmrghd v2, v2, v2
; P7-AIX32-NEXT: blr
entry:
%0 = bitcast i64* %s to <8 x i8>*
diff --git a/llvm/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll b/llvm/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
index 9838b9558a037..c8cad4bf184f8 100644
--- a/llvm/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
+++ b/llvm/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
@@ -1,4 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -ppc-disable-perfect-shuffle=false < %s | FileCheck %s
+
+; TODO: Fix this case when disabling perfect shuffle
+
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll
index bf6da83ae7955..7d6117719da1d 100644
--- a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll
@@ -1,19 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple powerpc64 -mcpu=pwr10 < %s | FileCheck %s --check-prefix=BE
; RUN: llc -mtriple powerpc64le -mcpu=pwr10 < %s | FileCheck %s --check-prefix=LE
+; RUN: llc -mtriple powerpc64le -mcpu=pwr10 -ppc-disable-perfect-shuffle=false < %s | FileCheck %s --check-prefix=LE
+; RUN: llc -mtriple powerpc64 -mcpu=pwr10 -ppc-disable-perfect-shuffle=false < %s | FileCheck %s --check-prefix=BE-ENABLE
+
+; TODO: Fix the worse codegen when disabling perfect shuffle
define <4 x float> @shuffle1(<16 x i8> %v1, <16 x i8> %v2) {
; BE-LABEL: shuffle1:
; BE: # %bb.0:
-; BE-NEXT: xxmrglw 0, 34, 35
-; BE-NEXT: xxmrghw 1, 34, 35
-; BE-NEXT: xxmrghw 34, 1, 0
+; BE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; BE-NEXT: lxv 36, 0(3)
+; BE-NEXT: vperm 2, 2, 3, 4
; BE-NEXT: blr
;
; LE-LABEL: shuffle1:
; LE: # %bb.0:
; LE-NEXT: vpkudum 2, 3, 2
; LE-NEXT: blr
+;
+; BE-ENABLE-LABEL: shuffle1:
+; BE-ENABLE: # %bb.0:
+; BE-ENABLE-NEXT: xxmrglw 0, 34, 35
+; BE-ENABLE-NEXT: xxmrghw 1, 34, 35
+; BE-ENABLE-NEXT: xxmrghw 34, 1, 0
+; BE-ENABLE-NEXT: blr
%shuf = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
%cast = bitcast <16 x i8> %shuf to <4 x float>
ret <4 x float> %cast
@@ -30,6 +42,11 @@ define <4 x float> @shuffle2(<16 x i8> %v1, <16 x i8> %v2) {
; LE-NEXT: plxv 36, .LCPI1_0@PCREL(0), 1
; LE-NEXT: vperm 2, 3, 2, 4
; LE-NEXT: blr
+;
+; BE-ENABLE-LABEL: shuffle2:
+; BE-ENABLE: # %bb.0:
+; BE-ENABLE-NEXT: vpkudum 2, 2, 3
+; BE-ENABLE-NEXT: blr
%shuf = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
%cast = bitcast <16 x i8> %shuf to <4 x float>
ret <4 x float> %cast
@@ -38,12 +55,11 @@ define <4 x float> @shuffle2(<16 x i8> %v1, <16 x i8> %v2) {
define <4 x float> @shuffle3(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x i8> %v4) {
; BE-LABEL: shuffle3:
; BE: # %bb.0:
-; BE-NEXT: xxmrglw 0, 34, 35
-; BE-NEXT: xxmrghw 1, 34, 35
-; BE-NEXT: xxmrghw 34, 1, 0
-; BE-NEXT: xxmrglw 0, 36, 37
-; BE-NEXT: xxmrghw 1, 36, 37
-; BE-NEXT: xxmrghw 35, 1, 0
+; BE-NEXT: addis 3, 2, .LCPI2_0@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_0@toc@l
+; BE-NEXT: lxv 32, 0(3)
+; BE-NEXT: vperm 2, 2, 3, 0
+; BE-NEXT: vperm 3, 4, 5, 0
; BE-NEXT: xvaddsp 34, 34, 35
; BE-NEXT: blr
;
@@ -53,6 +69,17 @@ define <4 x float> @shuffle3(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x
; LE-NEXT: vpkudum 3, 5, 4
; LE-NEXT: xvaddsp 34, 34, 35
; LE-NEXT: blr
+;
+; BE-ENABLE-LABEL: shuffle3:
+; BE-ENABLE: # %bb.0:
+; BE-ENABLE-NEXT: xxmrglw 0, 34, 35
+; BE-ENABLE-NEXT: xxmrghw 1, 34, 35
+; BE-ENABLE-NEXT: xxmrghw 34, 1, 0
+; BE-ENABLE-NEXT: xxmrglw 0, 36, 37
+; BE-ENABLE-NEXT: xxmrghw 1, 36, 37
+; BE-ENABLE-NEXT: xxmrghw 35, 1, 0
+; BE-ENABLE-NEXT: xvaddsp 34, 34, 35
+; BE-ENABLE-NEXT: blr
%shuf1 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
%shuf2 = shufflevector <16 x i8> %v3, <16 x i8> %v4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
%cast1 = bitcast <16 x i8> %shuf1 to <4 x float>
@@ -76,6 +103,13 @@ define <4 x float> @shuffle4(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x
; LE-NEXT: vperm 3, 5, 4, 0
; LE-NEXT: xvaddsp 34, 34, 35
; LE-NEXT: blr
+;
+; BE-ENABLE-LABEL: shuffle4:
+; BE-ENABLE: # %bb.0:
+; BE-ENABLE-NEXT: vpkudum 2, 2, 3
+; BE-ENABLE-NEXT: vpkudum 3, 4, 5
+; BE-ENABLE-NEXT: xvaddsp 34, 34, 35
+; BE-ENABLE-NEXT: blr
%shuf1 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
%shuf2 = shufflevector <16 x i8> %v3, <16 x i8> %v4, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
%cast1 = bitcast <16 x i8> %shuf1 to <4 x float>
@@ -87,22 +121,21 @@ define <4 x float> @shuffle4(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x
define <4 x float> @shuffle5(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x i8> %v4) {
; BE-LABEL: shuffle5:
; BE: # %bb.0: # %entry
-; BE-NEXT: xxmrglw 0, 34, 35
-; BE-NEXT: xxmrghw 1, 34, 35
+; BE-NEXT: addis 3, 2, .LCPI4_0@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI4_0@toc@l
+; BE-NEXT: lxv 32, 0(3)
; BE-NEXT: li 3, 8
; BE-NEXT: vextublx 3, 3, 2
-; BE-NEXT: xxmrghw 0, 1, 0
; BE-NEXT: andi. 3, 3, 255
-; BE-NEXT: xxlor 1, 0, 0
+; BE-NEXT: vperm 3, 2, 3, 0
+; BE-NEXT: vmr 2, 3
; BE-NEXT: beq 0, .LBB4_2
; BE-NEXT: # %bb.1: # %exit
-; BE-NEXT: xvaddsp 34, 0, 1
+; BE-NEXT: xvaddsp 34, 35, 34
; BE-NEXT: blr
; BE-NEXT: .LBB4_2: # %second
-; BE-NEXT: xxmrglw 1, 36, 37
-; BE-NEXT: xxmrghw 2, 36, 37
-; BE-NEXT: xxmrghw 1, 2, 1
-; BE-NEXT: xvaddsp 34, 0, 1
+; BE-NEXT: vperm 2, 4, 5, 0
+; BE-NEXT: xvaddsp 34, 35, 34
; BE-NEXT: blr
;
; LE-LABEL: shuffle5:
@@ -120,6 +153,26 @@ define <4 x float> @shuffle5(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x
; LE-NEXT: vpkudum 2, 5, 4
; LE-NEXT: xvaddsp 34, 35, 34
; LE-NEXT: blr
+;
+; BE-ENABLE-LABEL: shuffle5:
+; BE-ENABLE: # %bb.0: # %entry
+; BE-ENABLE-NEXT: xxmrglw 0, 34, 35
+; BE-ENABLE-NEXT: xxmrghw 1, 34, 35
+; BE-ENABLE-NEXT: li 3, 8
+; BE-ENABLE-NEXT: vextublx 3, 3, 2
+; BE-ENABLE-NEXT: xxmrghw 0, 1, 0
+; BE-ENABLE-NEXT: andi. 3, 3, 255
+; BE-ENABLE-NEXT: xxlor 1, 0, 0
+; BE-ENABLE-NEXT: beq 0, .LBB4_2
+; BE-ENABLE-NEXT: # %bb.1: # %exit
+; BE-ENABLE-NEXT: xvaddsp 34, 0, 1
+; BE-ENABLE-NEXT: blr
+; BE-ENABLE-NEXT: .LBB4_2: # %second
+; BE-ENABLE-NEXT: xxmrglw 1, 36, 37
+; BE-ENABLE-NEXT: xxmrghw 2, 36, 37
+; BE-ENABLE-NEXT: xxmrghw 1, 2, 1
+; BE-ENABLE-NEXT: xvaddsp 34, 0, 1
+; BE-ENABLE-NEXT: blr
entry:
%shuf1 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
%fetch = extractelement <16 x i8> %shuf1, i32 4
diff --git a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll
index dcccfe29182f6..a46b554707a90 100644
--- a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll
@@ -10,26 +10,27 @@ define dso_local fastcc void @BuildVectorICE() unnamed_addr {
; 32BIT: # %bb.0: # %entry
; 32BIT-NEXT: stwu 1, -64(1)
; 32BIT-NEXT: .cfi_def_cfa_offset 64
-; 32BIT-NEXT: lxvw4x 34, 0, 3
-; 32BIT-NEXT: li 3, 0
-; 32BIT-NEXT: addi 4, 1, 16
-; 32BIT-NEXT: addi 5, 1, 32
+; 32BIT-NEXT: li 3, .LCPI0_0@l
+; 32BIT-NEXT: lis 4, .LCPI0_0@ha
+; 32BIT-NEXT: addi 5, 1, 16
; 32BIT-NEXT: addi 6, 1, 48
; 32BIT-NEXT: li 7, 0
+; 32BIT-NEXT: lxvw4x 34, 0, 3
+; 32BIT-NEXT: lxvw4x 35, 4, 3
+; 32BIT-NEXT: li 3, 0
+; 32BIT-NEXT: addi 4, 1, 32
; 32BIT-NEXT: .p2align 4
; 32BIT-NEXT: .LBB0_1: # %while.body
; 32BIT-NEXT: #
-; 32BIT-NEXT: stw 7, 16(1)
; 32BIT-NEXT: stw 3, 32(1)
-; 32BIT-NEXT: lxvw4x 0, 0, 4
-; 32BIT-NEXT: lxvw4x 1, 0, 5
-; 32BIT-NEXT: xxsldwi 0, 1, 0, 1
-; 32BIT-NEXT: xxspltw 1, 1, 0
-; 32BIT-NEXT: xxsldwi 35, 0, 1, 3
-; 32BIT-NEXT: vadduwm 3, 2, 3
-; 32BIT-NEXT: xxspltw 36, 35, 1
-; 32BIT-NEXT: vadduwm 3, 3, 4
-; 32BIT-NEXT: stxvw4x 35, 0, 6
+; 32BIT-NEXT: stw 7, 16(1)
+; 32BIT-NEXT: lxvw4x 36, 0, 4
+; 32BIT-NEXT: lxvw4x 37, 0, 5
+; 32BIT-NEXT: vperm 4, 5, 4, 3
+; 32BIT-NEXT: vadduwm 4, 2, 4
+; 32BIT-NEXT: xxspltw 37, 36, 1
+; 32BIT-NEXT: vadduwm 4, 4, 5
+; 32BIT-NEXT: stxvw4x 36, 0, 6
; 32BIT-NEXT: lwz 7, 48(1)
; 32BIT-NEXT: b .LBB0_1
;
diff --git a/llvm/test/CodeGen/PowerPC/pr27078.ll b/llvm/test/CodeGen/PowerPC/pr27078.ll
index 1f89895661ff2..e1532f05ea4e9 100644
--- a/llvm/test/CodeGen/PowerPC/pr27078.ll
+++ b/llvm/test/CodeGen/PowerPC/pr27078.ll
@@ -4,23 +4,26 @@
define <4 x float> @bar(float* %p, float* %q) {
; CHECK-LABEL: bar:
; CHECK: # %bb.0:
-; CHECK-NEXT: lxvw4x 0, 0, 3
-; CHECK-NEXT: lxvw4x 1, 0, 4
; CHECK-NEXT: li 5, 16
-; CHECK-NEXT: lxvw4x 2, 3, 5
-; CHECK-NEXT: lxvw4x 3, 4, 5
+; CHECK-NEXT: lxvw4x 2, 0, 3
+; CHECK-NEXT: lxvw4x 3, 0, 4
+; CHECK-NEXT: lxvw4x 0, 3, 5
+; CHECK-NEXT: lxvw4x 1, 4, 5
; CHECK-NEXT: li 5, 32
-; CHECK-NEXT: lxvw4x 4, 4, 5
+; CHECK-NEXT: xvsubsp 35, 3, 2
+; CHECK-NEXT: xvsubsp 34, 1, 0
+; CHECK-NEXT: lxvw4x 0, 3, 5
+; CHECK-NEXT: lxvw4x 1, 4, 5
+; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; CHECK-NEXT: lxvw4x 36, 0, 3
+; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha
; CHECK-NEXT: xvsubsp 0, 1, 0
-; CHECK-NEXT: lxvw4x 1, 3, 5
-; CHECK-NEXT: xvsubsp 2, 3, 2
-; CHECK-NEXT: xvsubsp 1, 4, 1
-; CHECK-NEXT: xxsldwi 0, 0, 0, 1
-; CHECK-NEXT: xxmrglw 34, 0, 2
-; CHECK-NEXT: xxsldwi 0, 0, 34, 3
-; CHECK-NEXT: xxmrghw 34, 1, 1
-; CHECK-NEXT: xxsldwi 0, 34, 0, 3
-; CHECK-NEXT: xxsldwi 34, 0, 0, 1
+; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l
+; CHECK-NEXT: vperm 2, 3, 2, 4
+; CHECK-NEXT: lxvw4x 36, 0, 3
+; CHECK-NEXT: xxmrghw 35, 0, 0
+; CHECK-NEXT: vperm 2, 2, 3, 4
; CHECK-NEXT: blr
%1 = bitcast float* %p to <12 x float>*
%2 = bitcast float* %q to <12 x float>*
diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll
index 4e9a374c97523..b4e62073ec29c 100644
--- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll
@@ -35,10 +35,11 @@ define <4 x i32> @s2v_test1(i32* nocapture readonly %int32, <4 x i32> %vec) {
;
; P8BE-LABEL: s2v_test1:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: lfiwzx f0, 0, r3
-; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1
-; P8BE-NEXT: xxmrghw v2, v2, vs0
-; P8BE-NEXT: xxsldwi v2, v2, vs1, 3
+; P8BE-NEXT: addis r4, r2, .LCPI0_0@toc@ha
+; P8BE-NEXT: lxsiwzx v4, 0, r3
+; P8BE-NEXT: addi r4, r4, .LCPI0_0@toc@l
+; P8BE-NEXT: lxvw4x v3, 0, r4
+; P8BE-NEXT: vperm v2, v4, v2, v3
; P8BE-NEXT: blr
entry:
%0 = load i32, i32* %int32, align 4
@@ -74,11 +75,12 @@ define <4 x i32> @s2v_test2(i32* nocapture readonly %int32, <4 x i32> %vec) {
;
; P8BE-LABEL: s2v_test2:
; P8BE: # %bb.0: # %entry
+; P8BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha
; P8BE-NEXT: addi r3, r3, 4
-; P8BE-NEXT: lfiwzx f0, 0, r3
-; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1
-; P8BE-NEXT: xxmrghw v2, v2, vs0
-; P8BE-NEXT: xxsldwi v2, v2, vs1, 3
+; P8BE-NEXT: addi r4, r4, .LCPI1_0@toc@l
+; P8BE-NEXT: lxsiwzx v4, 0, r3
+; P8BE-NEXT: lxvw4x v3, 0, r4
+; P8BE-NEXT: vperm v2, v4, v2, v3
; P8BE-NEXT: blr
entry:
%arrayidx = getelementptr inbounds i32, i32* %int32, i64 1
@@ -117,11 +119,12 @@ define <4 x i32> @s2v_test3(i32* nocapture readonly %int32, <4 x i32> %vec, i32
;
; P8BE-LABEL: s2v_test3:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: sldi r4, r7, 2
-; P8BE-NEXT: lfiwzx f0, r3, r4
-; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1
-; P8BE-NEXT: xxmrghw v2, v2, vs0
-; P8BE-NEXT: xxsldwi v2, v2, vs1, 3
+; P8BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; P8BE-NEXT: sldi r5, r7, 2
+; P8BE-NEXT: addi r4, r4, .LCPI2_0@toc@l
+; P8BE-NEXT: lxsiwzx v3, r3, r5
+; P8BE-NEXT: lxvw4x v4, 0, r4
+; P8BE-NEXT: vperm v2, v3, v2, v4
; P8BE-NEXT: blr
entry:
%idxprom = sext i32 %Idx to i64
@@ -159,11 +162,12 @@ define <4 x i32> @s2v_test4(i32* nocapture readonly %int32, <4 x i32> %vec) {
;
; P8BE-LABEL: s2v_test4:
; P8BE: # %bb.0: # %entry
+; P8BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; P8BE-NEXT: addi r3, r3, 4
-; P8BE-NEXT: lfiwzx f0, 0, r3
-; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1
-; P8BE-NEXT: xxmrghw v2, v2, vs0
-; P8BE-NEXT: xxsldwi v2, v2, vs1, 3
+; P8BE-NEXT: addi r4, r4, .LCPI3_0@toc@l
+; P8BE-NEXT: lxsiwzx v4, 0, r3
+; P8BE-NEXT: lxvw4x v3, 0, r4
+; P8BE-NEXT: vperm v2, v4, v2, v3
; P8BE-NEXT: blr
entry:
%arrayidx = getelementptr inbounds i32, i32* %int32, i64 1
@@ -199,10 +203,11 @@ define <4 x i32> @s2v_test5(<4 x i32> %vec, i32* nocapture readonly %ptr1) {
;
; P8BE-LABEL: s2v_test5:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: lfiwzx f0, 0, r5
-; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1
-; P8BE-NEXT: xxmrghw v2, v2, vs0
-; P8BE-NEXT: xxsldwi v2, v2, vs1, 3
+; P8BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha
+; P8BE-NEXT: lxsiwzx v4, 0, r5
+; P8BE-NEXT: addi r3, r3, .LCPI4_0@toc@l
+; P8BE-NEXT: lxvw4x v3, 0, r3
+; P8BE-NEXT: vperm v2, v4, v2, v3
; P8BE-NEXT: blr
entry:
%0 = load i32, i32* %ptr1, align 4
@@ -237,10 +242,11 @@ define <4 x float> @s2v_test_f1(float* nocapture readonly %f64, <4 x float> %vec
;
; P8BE-LABEL: s2v_test_f1:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: lfiwzx f0, 0, r3
-; P8BE-NEXT: xxsldwi vs1, v2, vs0, 1
-; P8BE-NEXT: xxmrghw v2, v2, vs0
-; P8BE-NEXT: xxsldwi v2, v2, vs1, 3
+; P8BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha
+; P8BE-NEXT: lxsiwzx v4, 0, r3
+; P8BE-NEXT: addi r4, r4, .LCPI5_0@toc@l
+; P8BE-NEXT: lxvw4x v3, 0, r4
+; P8BE-NEXT: vperm v2, v4, v2, v3
; P8BE-NEXT: blr
entry:
%0 = load float, float* %f64, align 4
diff --git a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
index 138d96fd53e12..216152c1277fd 100644
--- a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
+++ b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
@@ -55,21 +55,24 @@ define dso_local <4 x i32> @test(<4 x i32> %a, double %b) {
; CHECK-BE-P7: # %bb.0: # %entry
; CHECK-BE-P7-NEXT: xscvdpsxws f0, f1
; CHECK-BE-P7-NEXT: addi r3, r1, -4
+; CHECK-BE-P7-NEXT: addis r4, r2, .LCPI0_0@toc@ha
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: xxsldwi vs0, v2, v2, 3
; CHECK-BE-P7-NEXT: stw r3, -32(r1)
-; CHECK-BE-P7-NEXT: addi r3, r1, -32
-; CHECK-BE-P7-NEXT: lxvw4x vs1, 0, r3
-; CHECK-BE-P7-NEXT: xxsldwi v2, vs0, vs1, 1
+; CHECK-BE-P7-NEXT: addi r3, r4, .LCPI0_0@toc@l
+; CHECK-BE-P7-NEXT: addi r4, r1, -32
+; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
+; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r4
+; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-P7-NEXT: blr
;
; CHECK-BE-P8-LABEL: test:
; CHECK-BE-P8: # %bb.0: # %entry
-; CHECK-BE-P8-NEXT: xscvdpsxws f0, f1
-; CHECK-BE-P8-NEXT: xxmrghw v3, v2, vs0
-; CHECK-BE-P8-NEXT: xxsldwi vs0, v3, v2, 3
-; CHECK-BE-P8-NEXT: xxsldwi v2, vs0, vs0, 1
+; CHECK-BE-P8-NEXT: xscvdpsxws v3, f1
+; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI0_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3
+; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4
; CHECK-BE-P8-NEXT: blr
;
; CHECK-BE-P9-LABEL: test:
@@ -118,21 +121,24 @@ define dso_local <4 x i32> @test2(<4 x i32> %a, float %b) {
; CHECK-BE-P7: # %bb.0: # %entry
; CHECK-BE-P7-NEXT: xscvdpsxws f0, f1
; CHECK-BE-P7-NEXT: addi r3, r1, -4
+; CHECK-BE-P7-NEXT: addis r4, r2, .LCPI1_0@toc@ha
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: xxsldwi vs0, v2, v2, 3
; CHECK-BE-P7-NEXT: stw r3, -32(r1)
-; CHECK-BE-P7-NEXT: addi r3, r1, -32
-; CHECK-BE-P7-NEXT: lxvw4x vs1, 0, r3
-; CHECK-BE-P7-NEXT: xxsldwi v2, vs0, vs1, 1
+; CHECK-BE-P7-NEXT: addi r3, r4, .LCPI1_0@toc@l
+; CHECK-BE-P7-NEXT: addi r4, r1, -32
+; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
+; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r4
+; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-P7-NEXT: blr
;
; CHECK-BE-P8-LABEL: test2:
; CHECK-BE-P8: # %bb.0: # %entry
-; CHECK-BE-P8-NEXT: xscvdpsxws f0, f1
-; CHECK-BE-P8-NEXT: xxmrghw v3, v2, vs0
-; CHECK-BE-P8-NEXT: xxsldwi vs0, v3, v2, 3
-; CHECK-BE-P8-NEXT: xxsldwi v2, vs0, vs0, 1
+; CHECK-BE-P8-NEXT: xscvdpsxws v3, f1
+; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI1_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3
+; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4
; CHECK-BE-P8-NEXT: blr
;
; CHECK-BE-P9-LABEL: test2:
@@ -181,21 +187,24 @@ define dso_local <4 x i32> @test3(<4 x i32> %a, double %b) {
; CHECK-BE-P7: # %bb.0: # %entry
; CHECK-BE-P7-NEXT: xscvdpuxws f0, f1
; CHECK-BE-P7-NEXT: addi r3, r1, -4
+; CHECK-BE-P7-NEXT: addis r4, r2, .LCPI2_0@toc@ha
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: xxsldwi vs0, v2, v2, 3
; CHECK-BE-P7-NEXT: stw r3, -32(r1)
-; CHECK-BE-P7-NEXT: addi r3, r1, -32
-; CHECK-BE-P7-NEXT: lxvw4x vs1, 0, r3
-; CHECK-BE-P7-NEXT: xxsldwi v2, vs0, vs1, 1
+; CHECK-BE-P7-NEXT: addi r3, r4, .LCPI2_0@toc@l
+; CHECK-BE-P7-NEXT: addi r4, r1, -32
+; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
+; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r4
+; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-P7-NEXT: blr
;
; CHECK-BE-P8-LABEL: test3:
; CHECK-BE-P8: # %bb.0: # %entry
-; CHECK-BE-P8-NEXT: xscvdpuxws f0, f1
-; CHECK-BE-P8-NEXT: xxmrghw v3, v2, vs0
-; CHECK-BE-P8-NEXT: xxsldwi vs0, v3, v2, 3
-; CHECK-BE-P8-NEXT: xxsldwi v2, vs0, vs0, 1
+; CHECK-BE-P8-NEXT: xscvdpuxws v3, f1
+; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI2_0@toc@ha
+; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI2_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3
+; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4
; CHECK-BE-P8-NEXT: blr
;
; CHECK-BE-P9-LABEL: test3:
@@ -244,21 +253,24 @@ define dso_local <4 x i32> @test4(<4 x i32> %a, float %b) {
; CHECK-BE-P7: # %bb.0: # %entry
; CHECK-BE-P7-NEXT: xscvdpuxws f0, f1
; CHECK-BE-P7-NEXT: addi r3, r1, -4
+; CHECK-BE-P7-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: xxsldwi vs0, v2, v2, 3
; CHECK-BE-P7-NEXT: stw r3, -32(r1)
-; CHECK-BE-P7-NEXT: addi r3, r1, -32
-; CHECK-BE-P7-NEXT: lxvw4x vs1, 0, r3
-; CHECK-BE-P7-NEXT: xxsldwi v2, vs0, vs1, 1
+; CHECK-BE-P7-NEXT: addi r3, r4, .LCPI3_0@toc@l
+; CHECK-BE-P7-NEXT: addi r4, r1, -32
+; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
+; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r4
+; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-P7-NEXT: blr
;
; CHECK-BE-P8-LABEL: test4:
; CHECK-BE-P8: # %bb.0: # %entry
-; CHECK-BE-P8-NEXT: xscvdpuxws f0, f1
-; CHECK-BE-P8-NEXT: xxmrghw v3, v2, vs0
-; CHECK-BE-P8-NEXT: xxsldwi vs0, v3, v2, 3
-; CHECK-BE-P8-NEXT: xxsldwi v2, vs0, vs0, 1
+; CHECK-BE-P8-NEXT: xscvdpuxws v3, f1
+; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha
+; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3
+; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4
; CHECK-BE-P8-NEXT: blr
;
; CHECK-BE-P9-LABEL: test4:
diff --git a/llvm/test/CodeGen/PowerPC/vec_extract_p9.ll b/llvm/test/CodeGen/PowerPC/vec_extract_p9.ll
index e66851358d1e3..f31d1cfb7a288 100644
--- a/llvm/test/CodeGen/PowerPC/vec_extract_p9.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_extract_p9.ll
@@ -190,10 +190,12 @@ define double @test10(<4 x i32> %a, <4 x i32> %b) {
; CHECK-BE-LABEL: test10:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha
-; CHECK-BE-NEXT: xxmrghw 0, 35, 35
-; CHECK-BE-NEXT: lfs 1, .LCPI9_0@toc@l(3)
-; CHECK-BE-NEXT: xxmrglw 0, 0, 34
-; CHECK-BE-NEXT: xsadddp 1, 0, 1
+; CHECK-BE-NEXT: addi 3, 3, .LCPI9_0@toc@l
+; CHECK-BE-NEXT: lxv 36, 0(3)
+; CHECK-BE-NEXT: addis 3, 2, .LCPI9_1@toc@ha
+; CHECK-BE-NEXT: lfs 0, .LCPI9_1@toc@l(3)
+; CHECK-BE-NEXT: vperm 2, 3, 2, 4
+; CHECK-BE-NEXT: xsadddp 1, 34, 0
; CHECK-BE-NEXT: blr
entry:
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 2, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/PowerPC/vec_perf_shuffle.ll b/llvm/test/CodeGen/PowerPC/vec_perf_shuffle.ll
index 05877a03563c3..9ada7ca658be1 100644
--- a/llvm/test/CodeGen/PowerPC/vec_perf_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_perf_shuffle.ll
@@ -1,4 +1,6 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | not grep vperm
+; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | not grep vperm
+
+; TODO: Fix this case when disabling perfect shuffle
define <4 x float> @test_uu72(<4 x float>* %P1, <4 x float>* %P2) {
%V1 = load <4 x float>, <4 x float>* %P1 ; <<4 x float>> [#uses=1]
diff --git a/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector.ll b/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector.ll
index 56ed28ab56cc2..6201ac6911f58 100644
--- a/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector.ll
@@ -2,7 +2,7 @@
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu -mattr=+power8-vector < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi -mattr=+power8-vector < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck -check-prefix=CHECK-PWR7 %s
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi < %s | FileCheck -check-prefix=CHECK-PWR7 %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi < %s | FileCheck -check-prefix=CHECK-PWR7-AIX %s
define void @VPKUDUM_unary(<2 x i64>* %A) {
; CHECK-LABEL: VPKUDUM_unary:
@@ -14,12 +14,22 @@ define void @VPKUDUM_unary(<2 x i64>* %A) {
;
; CHECK-PWR7-LABEL: VPKUDUM_unary:
; CHECK-PWR7: # %bb.0: # %entry
-; CHECK-PWR7-NEXT: lxvw4x 0, 0, 3
-; CHECK-PWR7-NEXT: xxmrglw 1, 0, 0
-; CHECK-PWR7-NEXT: xxmrghw 0, 0, 0
-; CHECK-PWR7-NEXT: xxmrglw 0, 0, 1
-; CHECK-PWR7-NEXT: stxvw4x 0, 0, 3
+; CHECK-PWR7-NEXT: addis 4, 2, .LCPI0_0@toc@ha
+; CHECK-PWR7-NEXT: lxvw4x 34, 0, 3
+; CHECK-PWR7-NEXT: addi 4, 4, .LCPI0_0@toc@l
+; CHECK-PWR7-NEXT: lxvw4x 35, 0, 4
+; CHECK-PWR7-NEXT: vperm 2, 2, 2, 3
+; CHECK-PWR7-NEXT: stxvw4x 34, 0, 3
; CHECK-PWR7-NEXT: blr
+;
+; CHECK-PWR7-AIX-LABEL: VPKUDUM_unary:
+; CHECK-PWR7-AIX: # %bb.0: # %entry
+; CHECK-PWR7-AIX-NEXT: ld 4, L..C0(2) # %const.0
+; CHECK-PWR7-AIX-NEXT: lxvw4x 34, 0, 3
+; CHECK-PWR7-AIX-NEXT: lxvw4x 35, 0, 4
+; CHECK-PWR7-AIX-NEXT: vperm 2, 2, 2, 3
+; CHECK-PWR7-AIX-NEXT: stxvw4x 34, 0, 3
+; CHECK-PWR7-AIX-NEXT: blr
entry:
%tmp = load <2 x i64>, <2 x i64>* %A
%tmp2 = bitcast <2 x i64> %tmp to <4 x i32>
@@ -45,13 +55,24 @@ define void @VPKUDUM(<2 x i64>* %A, <2 x i64>* %B) {
;
; CHECK-PWR7-LABEL: VPKUDUM:
; CHECK-PWR7: # %bb.0: # %entry
-; CHECK-PWR7-NEXT: lxvw4x 0, 0, 3
-; CHECK-PWR7-NEXT: lxvw4x 1, 0, 4
-; CHECK-PWR7-NEXT: xxmrglw 2, 0, 1
-; CHECK-PWR7-NEXT: xxmrghw 0, 0, 1
-; CHECK-PWR7-NEXT: xxmrglw 0, 0, 2
-; CHECK-PWR7-NEXT: stxvw4x 0, 0, 3
+; CHECK-PWR7-NEXT: addis 5, 2, .LCPI1_0@toc@ha
+; CHECK-PWR7-NEXT: lxvw4x 34, 0, 4
+; CHECK-PWR7-NEXT: lxvw4x 35, 0, 3
+; CHECK-PWR7-NEXT: addi 4, 5, .LCPI1_0@toc@l
+; CHECK-PWR7-NEXT: lxvw4x 36, 0, 4
+; CHECK-PWR7-NEXT: vperm 2, 3, 2, 4
+; CHECK-PWR7-NEXT: stxvw4x 34, 0, 3
; CHECK-PWR7-NEXT: blr
+;
+; CHECK-PWR7-AIX-LABEL: VPKUDUM:
+; CHECK-PWR7-AIX: # %bb.0: # %entry
+; CHECK-PWR7-AIX-NEXT: ld 5, L..C1(2) # %const.0
+; CHECK-PWR7-AIX-NEXT: lxvw4x 34, 0, 4
+; CHECK-PWR7-AIX-NEXT: lxvw4x 35, 0, 3
+; CHECK-PWR7-AIX-NEXT: lxvw4x 36, 0, 5
+; CHECK-PWR7-AIX-NEXT: vperm 2, 3, 2, 4
+; CHECK-PWR7-AIX-NEXT: stxvw4x 34, 0, 3
+; CHECK-PWR7-AIX-NEXT: blr
entry:
%tmp = load <2 x i64>, <2 x i64>* %A
%tmp2 = bitcast <2 x i64> %tmp to <4 x i32>