[llvm] 34dc1cc - [PowerPC] Exploit the vinsw, vinsd, and vins[wd][lr]x instructions on P10
Amy Kwan via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 18 06:18:02 PST 2021
Author: Baptiste Saleil
Date: 2021-02-18T14:17:47Z
New Revision: 34dc1ccb9606d1de7d4de7286fcc355fa8f7bcd7
URL: https://github.com/llvm/llvm-project/commit/34dc1ccb9606d1de7d4de7286fcc355fa8f7bcd7
DIFF: https://github.com/llvm/llvm-project/commit/34dc1ccb9606d1de7d4de7286fcc355fa8f7bcd7.diff
LOG: [PowerPC] Exploit the vinsw, vinsd, and vins[wd][lr]x instructions on P10
This patch generates the vinsw, vinsd, vinsblx, vinshlx, vinswlx, vinsdlx,
vinsbrx, vinshrx, vinswrx and vinsdrx instructions for vector insertion on P10.
Differential Revision: https://reviews.llvm.org/D94454
Added:
llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
Modified:
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/lib/Target/PowerPC/PPCInstrPrefix.td
llvm/lib/Target/PowerPC/PPCInstrVSX.td
llvm/test/CodeGen/PowerPC/pcrel-linkeropt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 116ab69fe008..af35f10b08c9 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1233,6 +1233,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
}
+
+ if (Subtarget.isISA3_1()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+ }
}
if (Subtarget.pairedVectorMemops()) {
@@ -10041,14 +10046,34 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
"Should only be called for ISD::INSERT_VECTOR_ELT");
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
- // We have legal lowering for constant indices but not for variable ones.
- if (!C)
- return SDValue();
EVT VT = Op.getValueType();
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
+ SDValue V3 = Op.getOperand(2);
+
+ if (Subtarget.isISA3_1()) {
+ // On P10, we have legal lowering for constant and variable indices for
+ // integer vectors.
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v2i64)
+ return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, V2, V3);
+ // For f32 and f64 vectors, we have legal lowering for variable indices.
+ // For f32 we also have legal lowering when the element is loaded from
+ // memory.
+ if (VT == MVT::v4f32 || VT == MVT::v2f64) {
+ if (!C || (VT == MVT::v4f32 && dyn_cast<LoadSDNode>(V2)))
+ return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, V2, V3);
+ return SDValue();
+ }
+ }
+
+ // Before P10, we have legal lowering for constant indices but not for
+ // variable ones.
+ if (!C)
+ return SDValue();
+
// We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
if (VT == MVT::v8i16 || VT == MVT::v16i8) {
SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index 5c1fb6a51d87..0c6749c8d235 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -26,6 +26,9 @@ def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [
def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [
SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1>
]>;
+def SDT_PPCVecInsertElt : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<3>
+]>;
//===----------------------------------------------------------------------===//
// ISA 3.1 specific PPCISD nodes.
@@ -39,6 +42,7 @@ def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx,
def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx,
[]>;
def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>;
+def PPCvecinsertelt : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsertElt, []>;
//===----------------------------------------------------------------------===//
@@ -2665,3 +2669,99 @@ let Predicates = [PrefixInstrs] in {
(XXBLENDVD $A, $B, $C)>;
}
+def InsertEltShift {
+ dag Sub32Left0 = (EXTRACT_SUBREG $rB, sub_32);
+ dag Sub32Left1 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 1, 0, 30);
+ dag Sub32Left2 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 2, 0, 29);
+ dag Left3 = (RLWINM8 $rB, 3, 0, 28);
+}
+
+let Predicates = [IsISA3_1, HasVSX, IsLittleEndian] in {
+ // Indexed vector insert element
+ def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i64:$rB)),
+ (VINSBRX $vDi, InsertEltShift.Sub32Left0, $rA)>;
+ def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i64:$rB)),
+ (VINSHRX $vDi, InsertEltShift.Sub32Left1, $rA)>;
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, $rA)>;
+ def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, $rA)>;
+
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, Bitcast.FltToInt)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
+ (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>;
+
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, f64:$A, i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
+ (VINSDRX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>;
+
+ // Immediate vector insert element
+ foreach i = [0, 1, 2, 3] in {
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i64 i))),
+ (VINSW $vDi, !mul(!sub(3, i), 4), $rA)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(!sub(3, i), 4), (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(!sub(3, i), 4), (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(!sub(3, i), 4), (LWZX memrr:$rA))>;
+ }
+ foreach i = [0, 1] in
+ def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, (i64 i))),
+ (VINSD $vDi, !mul(!sub(1, i), 8), $rA)>;
+}
+
+let Predicates = [IsISA3_1, HasVSX, IsBigEndian] in {
+ // Indexed vector insert element
+ def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i64:$rB)),
+ (VINSBLX $vDi, InsertEltShift.Sub32Left0, $rA)>;
+ def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i64:$rB)),
+ (VINSHLX $vDi, InsertEltShift.Sub32Left1, $rA)>;
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, $rA)>;
+ def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, $rA)>;
+
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, Bitcast.FltToInt)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>;
+
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, f64:$A, i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>;
+ def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
+ (VINSDLX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>;
+
+ // Immediate vector insert element
+ foreach i = [0, 1, 2, 3] in {
+ def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i64 i))),
+ (VINSW $vDi, !mul(i, 4), $rA)>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(i, 4), (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(i, 4), (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), (i64 i))),
+ (VINSW $vDi, !mul(i, 4), (LWZX memrr:$rA))>;
+ }
+ foreach i = [0, 1] in
+ def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, (i64 i))),
+ (VINSD $vDi, !mul(i, 8), $rA)>;
+}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 7eace18b7364..e8babce4fb20 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1810,6 +1810,14 @@ let PPC970_Single = 1, AddedComplexity = 400 in {
}
//----------------------------- DAG Definitions ------------------------------//
+
+// Output dag used to bitcast f32 to i32 and f64 to i64
+def Bitcast {
+ dag FltToInt = (i32 (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI (XSCVDPSPN $A),
+ (XSCVDPSPN $A), 3), sub_64)));
+ dag DblToLong = (i64 (MFVSRD $A));
+}
+
def FpMinMax {
dag F32Min = (COPY_TO_REGCLASS (XSMINDP (COPY_TO_REGCLASS $A, VSFRC),
(COPY_TO_REGCLASS $B, VSFRC)),
@@ -3345,10 +3353,8 @@ def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
let Predicates = [HasVSX, HasDirectMove] in {
// bitconvert f32 -> i32
// (convert to 32-bit fp single, shift right 1 word, move to GPR)
-def : Pat<(i32 (bitconvert f32:$S)),
- (i32 (MFVSRWZ (EXTRACT_SUBREG
- (XXSLDWI (XSCVDPSPN $S), (XSCVDPSPN $S), 3),
- sub_64)))>;
+def : Pat<(i32 (bitconvert f32:$A)), Bitcast.FltToInt>;
+
// bitconvert i32 -> f32
// (move to FPR, shift left 1 word, convert to 64-bit fp single)
def : Pat<(f32 (bitconvert i32:$A)),
@@ -3357,8 +3363,7 @@ def : Pat<(f32 (bitconvert i32:$A)),
// bitconvert f64 -> i64
// (move to GPR, nothing else needed)
-def : Pat<(i64 (bitconvert f64:$S)),
- (i64 (MFVSRD $S))>;
+def : Pat<(i64 (bitconvert f64:$A)), Bitcast.DblToLong>;
// bitconvert i64 -> f64
// (move to FPR, nothing else needed)
diff --git a/llvm/test/CodeGen/PowerPC/pcrel-linkeropt.ll b/llvm/test/CodeGen/PowerPC/pcrel-linkeropt.ll
index 604d57aa7f85..f6b4760659a1 100644
--- a/llvm/test/CodeGen/PowerPC/pcrel-linkeropt.ll
+++ b/llvm/test/CodeGen/PowerPC/pcrel-linkeropt.ll
@@ -42,11 +42,11 @@ define dso_local void @ReadWrite8() local_unnamed_addr #0 {
; CHECK-NEXT: pld r4, output8@got@pcrel(0), 1
; CHECK-NEXT: .reloc .Lpcrel0-8,R_PPC64_PCREL_OPT,.-(.Lpcrel0-8)
; CHECK-NEXT: lbz r3, 0(r3)
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
; In this test the stb r3, 0(r4) cannot be optimized because it
; uses the register r3 and that register is defined by lbz r3, 0(r3)
; which is defined between the pld and the stb.
-; CHECK-NEXT: stb r3, 0(r4)
-; CHECK-NEXT: blr
entry:
%0 = load i8, i8* @input8, align 1
store i8 %0, i8* @output8, align 1
@@ -61,11 +61,11 @@ define dso_local void @ReadWrite16() local_unnamed_addr #0 {
; CHECK-NEXT: pld r4, output16@got@pcrel(0), 1
; CHECK-NEXT: .reloc .Lpcrel1-8,R_PPC64_PCREL_OPT,.-(.Lpcrel1-8)
; CHECK-NEXT: lhz r3, 0(r3)
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
; In this test the sth r3, 0(r4) cannot be optimized because it
; uses the register r3 and that register is defined by lhz r3, 0(r3)
; which is defined between the pld and the sth.
-; CHECK-NEXT: sth r3, 0(r4)
-; CHECK-NEXT: blr
entry:
%0 = load i16, i16* @input16, align 2
store i16 %0, i16* @output16, align 2
@@ -165,11 +165,10 @@ define dso_local void @ReadWriteVi32() local_unnamed_addr #0 {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pld r3, inputVi32@got@pcrel(0), 1
; CHECK-NEXT: li r4, 45
-; CHECK-NEXT: mtfprwz f1, r4
-; CHECK-NEXT: lxvx vs0, 0, r3
+; CHECK-NEXT: lxvx v2, 0, r3
; CHECK-NEXT: pld r3, outputVi32@got@pcrel(0), 1
-; CHECK-NEXT: xxinsertw vs0, vs1, 8
-; CHECK-NEXT: stxvx vs0, 0, r3
+; CHECK-NEXT: vinsw v2, r4, 8
+; CHECK-NEXT: stxvx v2, 0, r3
; CHECK-NEXT: blr
entry:
%0 = load <4 x i32>, <4 x i32>* @inputVi32, align 16
@@ -286,8 +285,7 @@ declare void @Callee(...)
define dso_local void @FuncPtrCall() local_unnamed_addr #0 {
; CHECK-LABEL: FuncPtrCall:
-; CHECK: .localentry FuncPtrCall, 1
-; CHECK-NEXT: # %bb.0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pld r3, FuncPtrIn@got@pcrel(0), 1
; CHECK-NEXT: .Lpcrel10:
; CHECK-NEXT: .reloc .Lpcrel10-8,R_PPC64_PCREL_OPT,.-(.Lpcrel10-8)
@@ -317,8 +315,7 @@ entry:
define dso_local signext i32 @VecMultiUse() local_unnamed_addr #0 {
; CHECK-LABEL: VecMultiUse:
-; CHECK: .localentry VecMultiUse, 1
-; CHECK-NEXT: # %bb.0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mflr r0
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
@@ -355,8 +352,7 @@ entry:
define dso_local signext i32 @UseAddr(i32 signext %a) local_unnamed_addr #0 {
; CHECK-LABEL: UseAddr:
-; CHECK: .localentry UseAddr, 1
-; CHECK-NEXT: # %bb.0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mflr r0
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r0, 16(r1)
diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
new file mode 100644
index 000000000000..b6d6edfb7c52
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
@@ -0,0 +1,740 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-P9
+
+; Byte indexed
+
+define <16 x i8> @testByte(<16 x i8> %a, i64 %b, i64 %idx) {
+; CHECK-LABEL: testByte:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vinsbrx v2, r6, r5
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testByte:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: vinsblx v2, r6, r5
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testByte:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: addi r4, r1, -16
+; CHECK-P9-NEXT: clrldi r3, r6, 60
+; CHECK-P9-NEXT: stxv v2, -16(r1)
+; CHECK-P9-NEXT: stbx r5, r4, r3
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %conv = trunc i64 %b to i8
+ %vecins = insertelement <16 x i8> %a, i8 %conv, i64 %idx
+ ret <16 x i8> %vecins
+}
+
+; Halfword indexed
+
+define <8 x i16> @testHalf(<8 x i16> %a, i64 %b, i64 %idx) {
+; CHECK-LABEL: testHalf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: slwi r3, r6, 1
+; CHECK-NEXT: vinshrx v2, r3, r5
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testHalf:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: slwi r3, r6, 1
+; CHECK-BE-NEXT: vinshlx v2, r3, r5
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testHalf:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: addi r4, r1, -16
+; CHECK-P9-NEXT: rlwinm r3, r6, 1, 28, 30
+; CHECK-P9-NEXT: stxv v2, -16(r1)
+; CHECK-P9-NEXT: sthx r5, r4, r3
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %conv = trunc i64 %b to i16
+ %vecins = insertelement <8 x i16> %a, i16 %conv, i64 %idx
+ ret <8 x i16> %vecins
+}
+
+; Word indexed
+
+define <4 x i32> @testWord(<4 x i32> %a, i64 %b, i64 %idx) {
+; CHECK-LABEL: testWord:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: slwi r3, r6, 2
+; CHECK-NEXT: vinswrx v2, r3, r5
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testWord:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: slwi r3, r6, 2
+; CHECK-BE-NEXT: vinswlx v2, r3, r5
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testWord:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: addi r4, r1, -16
+; CHECK-P9-NEXT: rlwinm r3, r6, 2, 28, 29
+; CHECK-P9-NEXT: stxv v2, -16(r1)
+; CHECK-P9-NEXT: stwx r5, r4, r3
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %conv = trunc i64 %b to i32
+ %vecins = insertelement <4 x i32> %a, i32 %conv, i64 %idx
+ ret <4 x i32> %vecins
+}
+
+; Word immediate
+
+define <4 x i32> @testWordImm(<4 x i32> %a, i64 %b) {
+; CHECK-LABEL: testWordImm:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vinsw v2, r5, 8
+; CHECK-NEXT: vinsw v2, r5, 0
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testWordImm:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: vinsw v2, r5, 4
+; CHECK-BE-NEXT: vinsw v2, r5, 12
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testWordImm:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: mtfprwz f0, r5
+; CHECK-P9-NEXT: xxinsertw v2, vs0, 4
+; CHECK-P9-NEXT: xxinsertw v2, vs0, 12
+; CHECK-P9-NEXT: blr
+entry:
+ %conv = trunc i64 %b to i32
+ %vecins = insertelement <4 x i32> %a, i32 %conv, i32 1
+ %vecins2 = insertelement <4 x i32> %vecins, i32 %conv, i32 3
+ ret <4 x i32> %vecins2
+}
+
+; Doubleword indexed
+
+define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) {
+; CHECK-LABEL: testDoubleword:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: rlwinm r3, r6, 3, 0, 28
+; CHECK-NEXT: vinsdrx v2, r3, r5
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDoubleword:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: rlwinm r3, r6, 3, 0, 28
+; CHECK-BE-NEXT: vinsdlx v2, r3, r5
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDoubleword:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: addi r4, r1, -16
+; CHECK-P9-NEXT: rlwinm r3, r6, 3, 28, 28
+; CHECK-P9-NEXT: stxv v2, -16(r1)
+; CHECK-P9-NEXT: stdx r5, r4, r3
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %vecins = insertelement <2 x i64> %a, i64 %b, i64 %idx
+ ret <2 x i64> %vecins
+}
+
+; Doubleword immediate
+
+define <2 x i64> @testDoublewordImm(<2 x i64> %a, i64 %b) {
+; CHECK-LABEL: testDoublewordImm:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vinsd v2, r5, 0
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDoublewordImm:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: vinsd v2, r5, 8
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDoublewordImm:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: mtfprd f0, r5
+; CHECK-P9-NEXT: xxmrghd v2, v2, vs0
+; CHECK-P9-NEXT: blr
+entry:
+ %vecins = insertelement <2 x i64> %a, i64 %b, i32 1
+ ret <2 x i64> %vecins
+}
+
+define <2 x i64> @testDoublewordImm2(<2 x i64> %a, i64 %b) {
+; CHECK-LABEL: testDoublewordImm2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vinsd v2, r5, 8
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDoublewordImm2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: vinsd v2, r5, 0
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDoublewordImm2:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: mtfprd f0, r5
+; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1
+; CHECK-P9-NEXT: blr
+entry:
+ %vecins = insertelement <2 x i64> %a, i64 %b, i32 0
+ ret <2 x i64> %vecins
+}
+
+; Float indexed
+
+define <4 x float> @testFloat1(<4 x float> %a, float %b, i32 zeroext %idx1) {
+; CHECK-LABEL: testFloat1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xscvdpspn vs0, f1
+; CHECK-NEXT: extsw r3, r6
+; CHECK-NEXT: slwi r3, r3, 2
+; CHECK-NEXT: xxsldwi vs0, vs0, vs0, 3
+; CHECK-NEXT: mffprwz r4, f0
+; CHECK-NEXT: vinswrx v2, r3, r4
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testFloat1:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xscvdpspn vs0, f1
+; CHECK-BE-NEXT: extsw r3, r6
+; CHECK-BE-NEXT: slwi r3, r3, 2
+; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 3
+; CHECK-BE-NEXT: mffprwz r4, f0
+; CHECK-BE-NEXT: vinswlx v2, r3, r4
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testFloat1:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: addi r4, r1, -16
+; CHECK-P9-NEXT: rlwinm r3, r6, 2, 28, 29
+; CHECK-P9-NEXT: stxv v2, -16(r1)
+; CHECK-P9-NEXT: stfsx f1, r4, r3
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %vecins = insertelement <4 x float> %a, float %b, i32 %idx1
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @testFloat2(<4 x float> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) {
+; CHECK-LABEL: testFloat2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lwz r3, 0(r5)
+; CHECK-NEXT: extsw r4, r6
+; CHECK-NEXT: slwi r4, r4, 2
+; CHECK-NEXT: vinswrx v2, r4, r3
+; CHECK-NEXT: lwz r3, 1(r5)
+; CHECK-NEXT: extsw r4, r7
+; CHECK-NEXT: slwi r4, r4, 2
+; CHECK-NEXT: vinswrx v2, r4, r3
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testFloat2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lwz r3, 0(r5)
+; CHECK-BE-NEXT: extsw r4, r6
+; CHECK-BE-NEXT: slwi r4, r4, 2
+; CHECK-BE-NEXT: vinswlx v2, r4, r3
+; CHECK-BE-NEXT: lwz r3, 1(r5)
+; CHECK-BE-NEXT: extsw r4, r7
+; CHECK-BE-NEXT: slwi r4, r4, 2
+; CHECK-BE-NEXT: vinswlx v2, r4, r3
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testFloat2:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lwz r3, 0(r5)
+; CHECK-P9-NEXT: rlwinm r4, r6, 2, 28, 29
+; CHECK-P9-NEXT: addi r6, r1, -32
+; CHECK-P9-NEXT: stxv v2, -32(r1)
+; CHECK-P9-NEXT: stwx r3, r6, r4
+; CHECK-P9-NEXT: rlwinm r4, r7, 2, 28, 29
+; CHECK-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-P9-NEXT: lwz r3, 1(r5)
+; CHECK-P9-NEXT: addi r5, r1, -16
+; CHECK-P9-NEXT: stxv vs0, -16(r1)
+; CHECK-P9-NEXT: stwx r3, r5, r4
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %0 = bitcast i8* %b to float*
+ %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 1
+ %1 = bitcast i8* %add.ptr1 to float*
+ %2 = load float, float* %0, align 4
+ %vecins = insertelement <4 x float> %a, float %2, i32 %idx1
+ %3 = load float, float* %1, align 4
+ %vecins2 = insertelement <4 x float> %vecins, float %3, i32 %idx2
+ ret <4 x float> %vecins2
+}
+
+define <4 x float> @testFloat3(<4 x float> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) {
+; CHECK-LABEL: testFloat3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: plwz r3, 65536(r5), 0
+; CHECK-NEXT: extsw r4, r6
+; CHECK-NEXT: slwi r4, r4, 2
+; CHECK-NEXT: vinswrx v2, r4, r3
+; CHECK-NEXT: li r3, 1
+; CHECK-NEXT: extsw r4, r7
+; CHECK-NEXT: rldic r3, r3, 36, 27
+; CHECK-NEXT: slwi r4, r4, 2
+; CHECK-NEXT: lwzx r3, r5, r3
+; CHECK-NEXT: vinswrx v2, r4, r3
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testFloat3:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: plwz r3, 65536(r5), 0
+; CHECK-BE-NEXT: extsw r4, r6
+; CHECK-BE-NEXT: slwi r4, r4, 2
+; CHECK-BE-NEXT: vinswlx v2, r4, r3
+; CHECK-BE-NEXT: li r3, 1
+; CHECK-BE-NEXT: extsw r4, r7
+; CHECK-BE-NEXT: rldic r3, r3, 36, 27
+; CHECK-BE-NEXT: slwi r4, r4, 2
+; CHECK-BE-NEXT: lwzx r3, r5, r3
+; CHECK-BE-NEXT: vinswlx v2, r4, r3
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testFloat3:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lis r3, 1
+; CHECK-P9-NEXT: rlwinm r4, r6, 2, 28, 29
+; CHECK-P9-NEXT: addi r6, r1, -32
+; CHECK-P9-NEXT: lwzx r3, r5, r3
+; CHECK-P9-NEXT: stxv v2, -32(r1)
+; CHECK-P9-NEXT: stwx r3, r6, r4
+; CHECK-P9-NEXT: li r3, 1
+; CHECK-P9-NEXT: rlwinm r4, r7, 2, 28, 29
+; CHECK-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-P9-NEXT: rldic r3, r3, 36, 27
+; CHECK-P9-NEXT: lwzx r3, r5, r3
+; CHECK-P9-NEXT: addi r5, r1, -16
+; CHECK-P9-NEXT: stxv vs0, -16(r1)
+; CHECK-P9-NEXT: stwx r3, r5, r4
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %add.ptr = getelementptr inbounds i8, i8* %b, i64 65536
+ %0 = bitcast i8* %add.ptr to float*
+ %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 68719476736
+ %1 = bitcast i8* %add.ptr1 to float*
+ %2 = load float, float* %0, align 4
+ %vecins = insertelement <4 x float> %a, float %2, i32 %idx1
+ %3 = load float, float* %1, align 4
+ %vecins2 = insertelement <4 x float> %vecins, float %3, i32 %idx2
+ ret <4 x float> %vecins2
+}
+
+; Float immediate
+
+define <4 x float> @testFloatImm1(<4 x float> %a, float %b) {
+; CHECK-LABEL: testFloatImm1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xscvdpspn vs0, f1
+; CHECK-NEXT: xxsldwi vs0, vs0, vs0, 3
+; CHECK-NEXT: xxinsertw v2, vs0, 12
+; CHECK-NEXT: xxinsertw v2, vs0, 4
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testFloatImm1:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: xscvdpspn vs0, f1
+; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 3
+; CHECK-BE-NEXT: xxinsertw v2, vs0, 0
+; CHECK-BE-NEXT: xxinsertw v2, vs0, 8
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testFloatImm1:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: xscvdpspn vs0, f1
+; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3
+; CHECK-P9-NEXT: xxinsertw v2, vs0, 0
+; CHECK-P9-NEXT: xxinsertw v2, vs0, 8
+; CHECK-P9-NEXT: blr
+entry:
+ %vecins = insertelement <4 x float> %a, float %b, i32 0
+ %vecins1 = insertelement <4 x float> %vecins, float %b, i32 2
+ ret <4 x float> %vecins1
+}
+
+define <4 x float> @testFloatImm2(<4 x float> %a, i32* %b) {
+; CHECK-LABEL: testFloatImm2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lwz r3, 0(r5)
+; CHECK-NEXT: vinsw v2, r3, 12
+; CHECK-NEXT: lwz r3, 4(r5)
+; CHECK-NEXT: vinsw v2, r3, 4
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testFloatImm2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lwz r3, 0(r5)
+; CHECK-BE-NEXT: vinsw v2, r3, 0
+; CHECK-BE-NEXT: lwz r3, 4(r5)
+; CHECK-BE-NEXT: vinsw v2, r3, 8
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testFloatImm2:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lfs f0, 0(r5)
+; CHECK-P9-NEXT: xscvdpspn vs0, f0
+; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3
+; CHECK-P9-NEXT: xxinsertw v2, vs0, 0
+; CHECK-P9-NEXT: lfs f0, 4(r5)
+; CHECK-P9-NEXT: xscvdpspn vs0, f0
+; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3
+; CHECK-P9-NEXT: xxinsertw v2, vs0, 8
+; CHECK-P9-NEXT: blr
+entry:
+ %0 = bitcast i32* %b to float*
+ %add.ptr1 = getelementptr inbounds i32, i32* %b, i64 1
+ %1 = bitcast i32* %add.ptr1 to float*
+ %2 = load float, float* %0, align 4
+ %vecins = insertelement <4 x float> %a, float %2, i32 0
+ %3 = load float, float* %1, align 4
+ %vecins2 = insertelement <4 x float> %vecins, float %3, i32 2
+ ret <4 x float> %vecins2
+}
+
+define <4 x float> @testFloatImm3(<4 x float> %a, i32* %b) {
+; CHECK-LABEL: testFloatImm3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: plwz r3, 262144(r5), 0
+; CHECK-NEXT: vinsw v2, r3, 12
+; CHECK-NEXT: li r3, 1
+; CHECK-NEXT: rldic r3, r3, 38, 25
+; CHECK-NEXT: lwzx r3, r5, r3
+; CHECK-NEXT: vinsw v2, r3, 4
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testFloatImm3:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: plwz r3, 262144(r5), 0
+; CHECK-BE-NEXT: vinsw v2, r3, 0
+; CHECK-BE-NEXT: li r3, 1
+; CHECK-BE-NEXT: rldic r3, r3, 38, 25
+; CHECK-BE-NEXT: lwzx r3, r5, r3
+; CHECK-BE-NEXT: vinsw v2, r3, 8
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testFloatImm3:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lis r3, 4
+; CHECK-P9-NEXT: lfsx f0, r5, r3
+; CHECK-P9-NEXT: li r3, 1
+; CHECK-P9-NEXT: rldic r3, r3, 38, 25
+; CHECK-P9-NEXT: xscvdpspn vs0, f0
+; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3
+; CHECK-P9-NEXT: xxinsertw v2, vs0, 0
+; CHECK-P9-NEXT: lfsx f0, r5, r3
+; CHECK-P9-NEXT: xscvdpspn vs0, f0
+; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 3
+; CHECK-P9-NEXT: xxinsertw v2, vs0, 8
+; CHECK-P9-NEXT: blr
+entry:
+ %add.ptr = getelementptr inbounds i32, i32* %b, i64 65536
+ %0 = bitcast i32* %add.ptr to float*
+ %add.ptr1 = getelementptr inbounds i32, i32* %b, i64 68719476736
+ %1 = bitcast i32* %add.ptr1 to float*
+ %2 = load float, float* %0, align 4
+ %vecins = insertelement <4 x float> %a, float %2, i32 0
+ %3 = load float, float* %1, align 4
+ %vecins2 = insertelement <4 x float> %vecins, float %3, i32 2
+ ret <4 x float> %vecins2
+}
+
+; Double indexed
+
+define <2 x double> @testDouble1(<2 x double> %a, double %b, i32 zeroext %idx1) {
+; CHECK-LABEL: testDouble1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: extsw r4, r6
+; CHECK-NEXT: mffprd r3, f1
+; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-NEXT: vinsdrx v2, r4, r3
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDouble1:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: extsw r4, r6
+; CHECK-BE-NEXT: mffprd r3, f1
+; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-BE-NEXT: vinsdlx v2, r4, r3
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDouble1:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: addi r4, r1, -16
+; CHECK-P9-NEXT: rlwinm r3, r6, 3, 28, 28
+; CHECK-P9-NEXT: stxv v2, -16(r1)
+; CHECK-P9-NEXT: stfdx f1, r4, r3
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %vecins = insertelement <2 x double> %a, double %b, i32 %idx1
+ ret <2 x double> %vecins
+}
+
+define <2 x double> @testDouble2(<2 x double> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) {
+; CHECK-LABEL: testDouble2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ld r3, 0(r5)
+; CHECK-NEXT: extsw r4, r6
+; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-NEXT: vinsdrx v2, r4, r3
+; CHECK-NEXT: pld r3, 1(r5), 0
+; CHECK-NEXT: extsw r4, r7
+; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-NEXT: vinsdrx v2, r4, r3
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDouble2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: ld r3, 0(r5)
+; CHECK-BE-NEXT: extsw r4, r6
+; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-BE-NEXT: vinsdlx v2, r4, r3
+; CHECK-BE-NEXT: pld r3, 1(r5), 0
+; CHECK-BE-NEXT: extsw r4, r7
+; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-BE-NEXT: vinsdlx v2, r4, r3
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDouble2:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: ld r3, 0(r5)
+; CHECK-P9-NEXT: rlwinm r4, r6, 3, 28, 28
+; CHECK-P9-NEXT: addi r6, r1, -32
+; CHECK-P9-NEXT: stxv v2, -32(r1)
+; CHECK-P9-NEXT: stdx r3, r6, r4
+; CHECK-P9-NEXT: li r3, 1
+; CHECK-P9-NEXT: rlwinm r4, r7, 3, 28, 28
+; CHECK-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-P9-NEXT: ldx r3, r5, r3
+; CHECK-P9-NEXT: addi r5, r1, -16
+; CHECK-P9-NEXT: stxv vs0, -16(r1)
+; CHECK-P9-NEXT: stdx r3, r5, r4
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %0 = bitcast i8* %b to double*
+ %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 1
+ %1 = bitcast i8* %add.ptr1 to double*
+ %2 = load double, double* %0, align 8
+ %vecins = insertelement <2 x double> %a, double %2, i32 %idx1
+ %3 = load double, double* %1, align 8
+ %vecins2 = insertelement <2 x double> %vecins, double %3, i32 %idx2
+ ret <2 x double> %vecins2
+}
+
+; testDouble3: inserts two doubles loaded through %b into %a at the run-time
+; indices %idx1 and %idx2. The first load is at byte offset 65536 and the
+; second at byte offset 68719476736 (2^36).
+; Expected output: under the default and BE prefixes the first offset is folded
+; into a pld displacement, while the second is built with li/rldic and used by
+; an indexed ldx; the inserts are vinsdrx (default prefix) or vinsdlx (BE
+; prefix). Under the P9 prefix the vector is stored at negative offsets from
+; r1, the element is written into that slot with stdx, and the vector is
+; reloaded with lxv.
+; NOTE(review): the RUN lines that bind the three check prefixes to targets are
+; outside this excerpt; presumably P10 LE, P10 BE and P9 -- confirm.
+define <2 x double> @testDouble3(<2 x double> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) {
+; CHECK-LABEL: testDouble3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pld r3, 65536(r5), 0
+; CHECK-NEXT: extsw r4, r6
+; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-NEXT: vinsdrx v2, r4, r3
+; CHECK-NEXT: li r3, 1
+; CHECK-NEXT: extsw r4, r7
+; CHECK-NEXT: rldic r3, r3, 36, 27
+; CHECK-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-NEXT: ldx r3, r5, r3
+; CHECK-NEXT: vinsdrx v2, r4, r3
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDouble3:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: pld r3, 65536(r5), 0
+; CHECK-BE-NEXT: extsw r4, r6
+; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-BE-NEXT: vinsdlx v2, r4, r3
+; CHECK-BE-NEXT: li r3, 1
+; CHECK-BE-NEXT: extsw r4, r7
+; CHECK-BE-NEXT: rldic r3, r3, 36, 27
+; CHECK-BE-NEXT: rlwinm r4, r4, 3, 0, 28
+; CHECK-BE-NEXT: ldx r3, r5, r3
+; CHECK-BE-NEXT: vinsdlx v2, r4, r3
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDouble3:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lis r3, 1
+; CHECK-P9-NEXT: rlwinm r4, r6, 3, 28, 28
+; CHECK-P9-NEXT: addi r6, r1, -32
+; CHECK-P9-NEXT: ldx r3, r5, r3
+; CHECK-P9-NEXT: stxv v2, -32(r1)
+; CHECK-P9-NEXT: stdx r3, r6, r4
+; CHECK-P9-NEXT: li r3, 1
+; CHECK-P9-NEXT: rlwinm r4, r7, 3, 28, 28
+; CHECK-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-P9-NEXT: rldic r3, r3, 36, 27
+; CHECK-P9-NEXT: ldx r3, r5, r3
+; CHECK-P9-NEXT: addi r5, r1, -16
+; CHECK-P9-NEXT: stxv vs0, -16(r1)
+; CHECK-P9-NEXT: stdx r3, r5, r4
+; CHECK-P9-NEXT: lxv v2, -16(r1)
+; CHECK-P9-NEXT: blr
+entry:
+ %add.ptr = getelementptr inbounds i8, i8* %b, i64 65536
+ %0 = bitcast i8* %add.ptr to double*
+ %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 68719476736
+ %1 = bitcast i8* %add.ptr1 to double*
+ %2 = load double, double* %0, align 8
+ %vecins = insertelement <2 x double> %a, double %2, i32 %idx1
+ %3 = load double, double* %1, align 8
+ %vecins2 = insertelement <2 x double> %vecins, double %3, i32 %idx2
+ ret <2 x double> %vecins2
+}
+
+; Double immediate: <2 x double> insertelement tests that use a constant index.
+
+; testDoubleImm1: inserts the scalar argument %b into %a at constant index 0.
+; Expected output is a single permute with no memory access: xxmrghd under the
+; default prefix, xxpermdi with selector 1 under the BE and P9 prefixes.
+; NOTE(review): the RUN lines that bind the three check prefixes to targets are
+; outside this excerpt; presumably P10 LE, P10 BE and P9 -- confirm.
+define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) {
+; CHECK-LABEL: testDoubleImm1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
+; CHECK-NEXT: xxmrghd v2, v2, vs1
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDoubleImm1:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
+; CHECK-BE-NEXT: xxpermdi v2, vs1, v2, 1
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDoubleImm1:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: # kill: def $f1 killed $f1 def $vsl1
+; CHECK-P9-NEXT: xxpermdi v2, vs1, v2, 1
+; CHECK-P9-NEXT: blr
+entry:
+ %vecins = insertelement <2 x double> %a, double %b, i32 0
+ ret <2 x double> %vecins
+}
+
+; testDoubleImm2: loads a double directly through %b (an i32* bitcast to
+; double*, no offset) and inserts it into %a at constant index 0.
+; Expected output: lfd with displacement 0, then xxmrghd under the default
+; prefix or xxpermdi with selector 1 under the BE and P9 prefixes.
+; NOTE(review): the RUN lines that bind the three check prefixes to targets are
+; outside this excerpt; presumably P10 LE, P10 BE and P9 -- confirm.
+define <2 x double> @testDoubleImm2(<2 x double> %a, i32* %b) {
+; CHECK-LABEL: testDoubleImm2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lfd f0, 0(r5)
+; CHECK-NEXT: xxmrghd v2, v2, vs0
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDoubleImm2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lfd f0, 0(r5)
+; CHECK-BE-NEXT: xxpermdi v2, vs0, v2, 1
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDoubleImm2:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lfd f0, 0(r5)
+; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1
+; CHECK-P9-NEXT: blr
+entry:
+ %0 = bitcast i32* %b to double*
+ %1 = load double, double* %0, align 8
+ %vecins = insertelement <2 x double> %a, double %1, i32 0
+ ret <2 x double> %vecins
+}
+
+; testDoubleImm3: loads a double from one i32 element past %b (byte offset 4)
+; and inserts it into %a at constant index 0.
+; Expected output: the offset is folded into the load as lfd f0, 4(r5), then
+; xxmrghd under the default prefix or xxpermdi with selector 1 under the BE
+; and P9 prefixes.
+; NOTE(review): the RUN lines that bind the three check prefixes to targets are
+; outside this excerpt; presumably P10 LE, P10 BE and P9 -- confirm.
+define <2 x double> @testDoubleImm3(<2 x double> %a, i32* %b) {
+; CHECK-LABEL: testDoubleImm3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lfd f0, 4(r5)
+; CHECK-NEXT: xxmrghd v2, v2, vs0
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDoubleImm3:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lfd f0, 4(r5)
+; CHECK-BE-NEXT: xxpermdi v2, vs0, v2, 1
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDoubleImm3:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lfd f0, 4(r5)
+; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1
+; CHECK-P9-NEXT: blr
+entry:
+ %add.ptr = getelementptr inbounds i32, i32* %b, i64 1
+ %0 = bitcast i32* %add.ptr to double*
+ %1 = load double, double* %0, align 8
+ %vecins = insertelement <2 x double> %a, double %1, i32 0
+ ret <2 x double> %vecins
+}
+
+; testDoubleImm4: loads a double from 65536 i32 elements past %b (byte offset
+; 262144) and inserts it into %a at constant index 0.
+; Expected output: the offset is materialized with lis r3, 4 (4 << 16 = 262144)
+; and consumed by an indexed lfdx, then xxmrghd under the default prefix or
+; xxpermdi with selector 1 under the BE and P9 prefixes.
+; NOTE(review): the RUN lines that bind the three check prefixes to targets are
+; outside this excerpt; presumably P10 LE, P10 BE and P9 -- confirm.
+define <2 x double> @testDoubleImm4(<2 x double> %a, i32* %b) {
+; CHECK-LABEL: testDoubleImm4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lis r3, 4
+; CHECK-NEXT: lfdx f0, r5, r3
+; CHECK-NEXT: xxmrghd v2, v2, vs0
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDoubleImm4:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lis r3, 4
+; CHECK-BE-NEXT: lfdx f0, r5, r3
+; CHECK-BE-NEXT: xxpermdi v2, vs0, v2, 1
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDoubleImm4:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lis r3, 4
+; CHECK-P9-NEXT: lfdx f0, r5, r3
+; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1
+; CHECK-P9-NEXT: blr
+entry:
+ %add.ptr = getelementptr inbounds i32, i32* %b, i64 65536
+ %0 = bitcast i32* %add.ptr to double*
+ %1 = load double, double* %0, align 8
+ %vecins = insertelement <2 x double> %a, double %1, i32 0
+ ret <2 x double> %vecins
+}
+
+; testDoubleImm5: loads a double from 68719476736 (2^36) i32 elements past %b
+; (byte offset 2^38) and inserts it into %a at constant index 0.
+; Expected output: the offset is built with li r3, 1 followed by rldic with a
+; shift of 38 and consumed by an indexed lfdx, then xxmrghd under the default
+; prefix or xxpermdi with selector 1 under the BE and P9 prefixes.
+; NOTE(review): the RUN lines that bind the three check prefixes to targets are
+; outside this excerpt; presumably P10 LE, P10 BE and P9 -- confirm.
+define <2 x double> @testDoubleImm5(<2 x double> %a, i32* %b) {
+; CHECK-LABEL: testDoubleImm5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li r3, 1
+; CHECK-NEXT: rldic r3, r3, 38, 25
+; CHECK-NEXT: lfdx f0, r5, r3
+; CHECK-NEXT: xxmrghd v2, v2, vs0
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: testDoubleImm5:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: li r3, 1
+; CHECK-BE-NEXT: rldic r3, r3, 38, 25
+; CHECK-BE-NEXT: lfdx f0, r5, r3
+; CHECK-BE-NEXT: xxpermdi v2, vs0, v2, 1
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P9-LABEL: testDoubleImm5:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: li r3, 1
+; CHECK-P9-NEXT: rldic r3, r3, 38, 25
+; CHECK-P9-NEXT: lfdx f0, r5, r3
+; CHECK-P9-NEXT: xxpermdi v2, vs0, v2, 1
+; CHECK-P9-NEXT: blr
+entry:
+ %add.ptr = getelementptr inbounds i32, i32* %b, i64 68719476736
+ %0 = bitcast i32* %add.ptr to double*
+ %1 = load double, double* %0, align 8
+ %vecins = insertelement <2 x double> %a, double %1, i32 0
+ ret <2 x double> %vecins
+}
+
More information about the llvm-commits
mailing list