[llvm] f4ea105 - [SystemZ] Implement i128 funnel shifts
Ulrich Weigand via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 15 10:29:26 PDT 2025
Author: Ulrich Weigand
Date: 2025-03-15T18:28:44+01:00
New Revision: f4ea1055ad574e0e17fe19662a8c8bcf17e64bfe
URL: https://github.com/llvm/llvm-project/commit/f4ea1055ad574e0e17fe19662a8c8bcf17e64bfe
DIFF: https://github.com/llvm/llvm-project/commit/f4ea1055ad574e0e17fe19662a8c8bcf17e64bfe.diff
LOG: [SystemZ] Implement i128 funnel shifts
These can be handled via the VECTOR SHIFT LEFT/RIGHT DOUBLE
family of instructions, depending on architecture level.
Fixes: https://github.com/llvm/llvm-project/issues/129955
Added:
llvm/test/CodeGen/SystemZ/shift-17.ll
Modified:
llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
llvm/lib/Target/SystemZ/SystemZISelLowering.h
llvm/lib/Target/SystemZ/SystemZInstrVector.td
llvm/lib/Target/SystemZ/SystemZOperators.td
llvm/test/CodeGen/SystemZ/rot-03.ll
llvm/test/CodeGen/SystemZ/shift-16.ll
llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 895d3c214a03c..66be2478f2891 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -254,6 +254,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ROTR, MVT::i128, Expand);
setOperationAction(ISD::ROTL, MVT::i128, Expand);
+ // We may be able to use VSLDB/VSLD/VSRD for these.
+ setOperationAction(ISD::FSHL, MVT::i128, Custom);
+ setOperationAction(ISD::FSHR, MVT::i128, Custom);
+
// No special instructions for these before arch15.
if (!Subtarget.hasVectorEnhancements3()) {
setOperationAction(ISD::MUL, MVT::i128, Expand);
@@ -6644,6 +6648,66 @@ SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
return Op;
}
+SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ // i128 FSHL with a constant amount that is a multiple of 8 can be
+ // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
+ // facility, FSHL with a constant amount less than 8 can be implemented
+ // via SHL_DOUBLE_BIT, and FSHL with other constant amounts by a
+ // combination of the two.
+ if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
+ uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
+ if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
+ SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+ SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ SmallVector<int, 16> Mask(16);
+ for (unsigned Elt = 0; Elt < 16; Elt++)
+ Mask[Elt] = (ShiftAmt >> 3) + Elt;
+ SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+ if ((ShiftAmt & 7) == 0)
+ return DAG.getBitcast(MVT::i128, Shuf1);
+ SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
+ DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+  // i128 FSHR with a constant amount that is a multiple of 8 can be
+  // implemented via VECTOR_SHUFFLE.  If we have the vector-enhancements-2
+  // facility, FSHR with a constant amount less than 8 can be implemented
+  // via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a
+  // combination of the two.
+ if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
+ uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
+ if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
+ SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+ SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ SmallVector<int, 16> Mask(16);
+ for (unsigned Elt = 0; Elt < 16; Elt++)
+ Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
+ SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+ if ((ShiftAmt & 7) == 0)
+ return DAG.getBitcast(MVT::i128, Shuf1);
+ SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
+ DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+ }
+ }
+
+ return SDValue();
+}
+
static SDValue lowerAddrSpaceCast(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
SDValue Src = Op.getOperand(0);
@@ -6853,6 +6917,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerAddrSpaceCast(Op, DAG);
case ISD::ROTL:
return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
+ case ISD::FSHL:
+ return lowerFSHL(Op, DAG);
+ case ISD::FSHR:
+ return lowerFSHR(Op, DAG);
case ISD::IS_FPCLASS:
return lowerIS_FPCLASS(Op, DAG);
case ISD::GET_ROUNDING:
@@ -7063,6 +7131,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(VSRL_BY_SCALAR);
OPCODE(VSRA_BY_SCALAR);
OPCODE(VROTL_BY_SCALAR);
+ OPCODE(SHL_DOUBLE_BIT);
+ OPCODE(SHR_DOUBLE_BIT);
OPCODE(VSUM);
OPCODE(VACC);
OPCODE(VSCBI);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index acdb8cb4cb842..a97962c17767c 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -222,6 +222,11 @@ enum NodeType : unsigned {
VSRA_BY_SCALAR,
VROTL_BY_SCALAR,
+ // Concatenate the vectors in the first two operands, shift them left/right
+ // bitwise by the third operand, and take the first/last half of the result.
+ SHL_DOUBLE_BIT,
+ SHR_DOUBLE_BIT,
+
// For each element of the output type, sum across all sub-elements of
// operand 0 belonging to the corresponding element, and add in the
// rightmost sub-element of the corresponding element of operand 1.
@@ -736,6 +741,8 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
+ SDValue lowerFSHL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFSHR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 29c92915c2317..3187d91b00046 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -974,8 +974,11 @@ let Predicates = [FeatureVector] in {
(VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>;
// Shift left double by bit.
- let Predicates = [FeatureVectorEnhancements2] in
- def VSLD : TernaryVRId<"vsld", 0xE786, int_s390_vsld, v128b, v128b, 0>;
+ let Predicates = [FeatureVectorEnhancements2] in {
+ def VSLD : TernaryVRId<"vsld", 0xE786, z_shl_double_bit, v128b, v128b, 0>;
+ def : Pat<(int_s390_vsld VR128:$x, VR128:$y, imm32zx8_timm:$z),
+ (VSLD VR128:$x, VR128:$y, imm32zx8:$z)>;
+ }
// Shift right arithmetic.
def VSRA : BinaryVRRc<"vsra", 0xE77E, int_s390_vsra, v128b, v128b>;
@@ -990,8 +993,11 @@ let Predicates = [FeatureVector] in {
def VSRLB : BinaryVRRc<"vsrlb", 0xE77D, int_s390_vsrlb, v128b, v128b>;
// Shift right double by bit.
- let Predicates = [FeatureVectorEnhancements2] in
- def VSRD : TernaryVRId<"vsrd", 0xE787, int_s390_vsrd, v128b, v128b, 0>;
+ let Predicates = [FeatureVectorEnhancements2] in {
+ def VSRD : TernaryVRId<"vsrd", 0xE787, z_shr_double_bit, v128b, v128b, 0>;
+ def : Pat<(int_s390_vsrd VR128:$x, VR128:$y, imm32zx8_timm:$z),
+ (VSRD VR128:$x, VR128:$y, imm32zx8:$z)>;
+ }
// Subtract.
def VS : BinaryVRRcGeneric<"vs", 0xE7F7>;
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 8d7ee50c08742..39e216b993b11 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -354,6 +354,8 @@ def z_splat : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>;
def z_merge_high : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>;
def z_merge_low : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>;
def z_shl_double : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>;
+def z_shl_double_bit : SDNode<"SystemZISD::SHL_DOUBLE_BIT", SDT_ZVecTernaryInt>;
+def z_shr_double_bit : SDNode<"SystemZISD::SHR_DOUBLE_BIT", SDT_ZVecTernaryInt>;
def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS",
SDT_ZVecTernaryInt>;
def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
diff --git a/llvm/test/CodeGen/SystemZ/rot-03.ll b/llvm/test/CodeGen/SystemZ/rot-03.ll
index 22e4b13cc8d02..8f42439dabdf8 100644
--- a/llvm/test/CodeGen/SystemZ/rot-03.ll
+++ b/llvm/test/CodeGen/SystemZ/rot-03.ll
@@ -30,11 +30,7 @@ define i128 @f2(i128 %val) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepib %v1, 96
-; CHECK-NEXT: vrepib %v2, 32
-; CHECK-NEXT: vsrlb %v1, %v0, %v1
-; CHECK-NEXT: vslb %v0, %v0, %v2
-; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vsldb %v0, %v0, %v0, 4
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
@@ -55,10 +51,11 @@ define i128 @f3(i128 %val, i128 %amt) {
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepb %v1, %v1, 15
; CHECK-NEXT: vslb %v2, %v0, %v1
-; CHECK-NEXT: lhi %r1, 128
-; CHECK-NEXT: sr %r1, %r0
; CHECK-NEXT: vsl %v1, %v2, %v1
-; CHECK-NEXT: vlvgp %v2, %r1, %r1
+; CHECK-NEXT: vrepib %v2, 1
+; CHECK-NEXT: xilf %r0, 4294967295
+; CHECK-NEXT: vsrl %v0, %v0, %v2
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
; CHECK-NEXT: vrepb %v2, %v2, 15
; CHECK-NEXT: vsrlb %v0, %v0, %v2
; CHECK-NEXT: vsrl %v0, %v0, %v2
diff --git a/llvm/test/CodeGen/SystemZ/shift-16.ll b/llvm/test/CodeGen/SystemZ/shift-16.ll
index d9d0e06ba262b..d81c3546998be 100644
--- a/llvm/test/CodeGen/SystemZ/shift-16.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-16.ll
@@ -7,23 +7,25 @@
define i256 @f1(i256 %a, i256 %sh) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: l %r0, 28(%r4)
; CHECK-NEXT: clijhe %r0, 128, .LBB0_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: lhi %r1, 128
-; CHECK-NEXT: sr %r1, %r0
-; CHECK-NEXT: vlvgp %v2, %r1, %r1
-; CHECK-NEXT: vrepb %v2, %v2, 15
-; CHECK-NEXT: vsrlb %v3, %v1, %v2
-; CHECK-NEXT: vsrl %v2, %v3, %v2
-; CHECK-NEXT: vlvgp %v3, %r0, %r0
-; CHECK-NEXT: vrepb %v3, %v3, 15
-; CHECK-NEXT: vslb %v4, %v0, %v3
+; CHECK-NEXT: lr %r1, %r0
+; CHECK-NEXT: xilf %r1, 4294967295
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vlvgp %v5, %r1, %r1
+; CHECK-NEXT: vrepib %v4, 1
+; CHECK-NEXT: vrepb %v3, %v2, 15
+; CHECK-NEXT: vsrl %v4, %v1, %v4
+; CHECK-NEXT: vrepb %v5, %v5, 15
+; CHECK-NEXT: vslb %v2, %v0, %v3
+; CHECK-NEXT: vsrlb %v4, %v4, %v5
; CHECK-NEXT: vslb %v1, %v1, %v3
-; CHECK-NEXT: vsl %v4, %v4, %v3
-; CHECK-NEXT: vo %v2, %v4, %v2
+; CHECK-NEXT: vsl %v2, %v2, %v3
+; CHECK-NEXT: vsrl %v4, %v4, %v5
+; CHECK-NEXT: vo %v2, %v2, %v4
; CHECK-NEXT: vsl %v1, %v1, %v3
; CHECK-NEXT: cijlh %r0, 0, .LBB0_3
; CHECK-NEXT: j .LBB0_4
@@ -49,22 +51,24 @@ define i256 @f1(i256 %a, i256 %sh) {
define i256 @f2(i256 %a, i256 %sh) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v0, 16(%r3), 3
; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v0, 16(%r3), 3
; CHECK-NEXT: l %r0, 28(%r4)
; CHECK-NEXT: clijhe %r0, 128, .LBB1_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: lhi %r1, 128
-; CHECK-NEXT: sr %r1, %r0
-; CHECK-NEXT: vlvgp %v2, %r1, %r1
-; CHECK-NEXT: vrepb %v2, %v2, 15
-; CHECK-NEXT: vslb %v3, %v1, %v2
-; CHECK-NEXT: vsl %v2, %v3, %v2
-; CHECK-NEXT: vlvgp %v3, %r0, %r0
-; CHECK-NEXT: vrepb %v3, %v3, 15
-; CHECK-NEXT: vsrlb %v4, %v0, %v3
+; CHECK-NEXT: lr %r1, %r0
+; CHECK-NEXT: xilf %r1, 4294967295
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vlvgp %v5, %r1, %r1
+; CHECK-NEXT: vrepib %v4, 1
+; CHECK-NEXT: vrepb %v3, %v2, 15
+; CHECK-NEXT: vsl %v4, %v1, %v4
+; CHECK-NEXT: vrepb %v5, %v5, 15
+; CHECK-NEXT: vsrlb %v2, %v0, %v3
+; CHECK-NEXT: vslb %v4, %v4, %v5
; CHECK-NEXT: vsrlb %v1, %v1, %v3
-; CHECK-NEXT: vsrl %v4, %v4, %v3
+; CHECK-NEXT: vsrl %v2, %v2, %v3
+; CHECK-NEXT: vsl %v4, %v4, %v5
; CHECK-NEXT: vo %v2, %v4, %v2
; CHECK-NEXT: vsrl %v1, %v1, %v3
; CHECK-NEXT: cijlh %r0, 0, .LBB1_3
@@ -92,23 +96,25 @@ define i256 @f3(i256 %a, i256 %sh) {
; CHECK-LABEL: f3:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 16(%r3), 3
-; CHECK-NEXT: vl %v2, 0(%r3), 3
; CHECK-NEXT: l %r0, 28(%r4)
+; CHECK-NEXT: vl %v2, 0(%r3), 3
; CHECK-NEXT: clijhe %r0, 128, .LBB2_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: lhi %r1, 128
-; CHECK-NEXT: sr %r1, %r0
; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vlvgp %v4, %r1, %r1
; CHECK-NEXT: vrepb %v3, %v1, 15
-; CHECK-NEXT: vrepb %v4, %v4, 15
; CHECK-NEXT: vsrab %v1, %v2, %v3
-; CHECK-NEXT: vslb %v2, %v2, %v4
-; CHECK-NEXT: vsl %v2, %v2, %v4
; CHECK-NEXT: vsrlb %v4, %v0, %v3
; CHECK-NEXT: vsra %v1, %v1, %v3
+; CHECK-NEXT: lr %r1, %r0
; CHECK-NEXT: vsrl %v3, %v4, %v3
-; CHECK-NEXT: vo %v2, %v3, %v2
+; CHECK-NEXT: vrepib %v4, 1
+; CHECK-NEXT: xilf %r1, 4294967295
+; CHECK-NEXT: vsl %v2, %v2, %v4
+; CHECK-NEXT: vlvgp %v4, %r1, %r1
+; CHECK-NEXT: vrepb %v4, %v4, 15
+; CHECK-NEXT: vslb %v2, %v2, %v4
+; CHECK-NEXT: vsl %v2, %v2, %v4
+; CHECK-NEXT: vo %v2, %v2, %v3
; CHECK-NEXT: cijlh %r0, 0, .LBB2_3
; CHECK-NEXT: j .LBB2_4
; CHECK-NEXT: .LBB2_2:
diff --git a/llvm/test/CodeGen/SystemZ/shift-17.ll b/llvm/test/CodeGen/SystemZ/shift-17.ll
new file mode 100644
index 0000000000000..45f4ed4d70d20
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/shift-17.ll
@@ -0,0 +1,251 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; Test 128-bit funnel shifts.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s --check-prefix=Z15
+
+declare i128 @llvm.fshl.i128(i128, i128, i128)
+declare i128 @llvm.fshr.i128(i128, i128, i128)
+
+; Funnel shift left by constant multiple of 8.
+define i128 @f1(i128 %a, i128 %b) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vsldb %v0, %v1, %v0, 4
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f1:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsldb %v0, %v1, %v0, 4
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 32)
+ ret i128 %res
+}
+
+; Funnel shift left by constant smaller than 8.
+define i128 @f2(i128 %a, i128 %b) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vrepib %v2, 5
+; CHECK-NEXT: vsl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 123
+; CHECK-NEXT: vsrlb %v0, %v0, %v2
+; CHECK-NEXT: vsrl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f2:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsld %v0, %v1, %v0, 5
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 5)
+ ret i128 %res
+}
+
+; Funnel shift left by some other constant.
+define i128 @f3(i128 %a, i128 %b) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vrepib %v2, 86
+; CHECK-NEXT: vsrlb %v1, %v1, %v2
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vsrl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 42
+; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f3:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsldb %v1, %v1, %v0, 5
+; Z15-NEXT: vsldb %v0, %v0, %v0, 5
+; Z15-NEXT: vsld %v0, %v1, %v0, 2
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 42)
+ ret i128 %res
+}
+
+; Funnel shift left by a variable amount.
+define i128 @f4(i128 %a, i128 %b, i128 %sh) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: l %r0, 12(%r5)
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vrepb %v2, %v2, 15
+; CHECK-NEXT: vslb %v1, %v1, %v2
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vsl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 1
+; CHECK-NEXT: xilf %r0, 4294967295
+; CHECK-NEXT: vsrl %v0, %v0, %v2
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vrepb %v2, %v2, 15
+; CHECK-NEXT: vsrlb %v0, %v0, %v2
+; CHECK-NEXT: vsrl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f4:
+; Z15: # %bb.0:
+; Z15-NEXT: l %r0, 12(%r5)
+; Z15-NEXT: vlvgp %v2, %r0, %r0
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vrepb %v2, %v2, 15
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vslb %v1, %v1, %v2
+; Z15-NEXT: vsl %v1, %v1, %v2
+; Z15-NEXT: vrepib %v2, 1
+; Z15-NEXT: xilf %r0, 4294967295
+; Z15-NEXT: vsrl %v0, %v0, %v2
+; Z15-NEXT: vlvgp %v2, %r0, %r0
+; Z15-NEXT: vrepb %v2, %v2, 15
+; Z15-NEXT: vsrlb %v0, %v0, %v2
+; Z15-NEXT: vsrl %v0, %v0, %v2
+; Z15-NEXT: vo %v0, %v1, %v0
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 %sh)
+ ret i128 %res
+}
+
+; Funnel shift right by constant multiple of 8.
+define i128 @f5(i128 %a, i128 %b) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vsldb %v0, %v1, %v0, 12
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f5:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsldb %v0, %v1, %v0, 12
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 32)
+ ret i128 %res
+}
+
+; Funnel shift right by constant smaller than 8.
+define i128 @f6(i128 %a, i128 %b) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepib %v2, 5
+; CHECK-NEXT: vsrl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 123
+; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f6:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsrd %v0, %v1, %v0, 5
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 5)
+ ret i128 %res
+}
+
+; Funnel shift right by some other constant.
+define i128 @f7(i128 %a, i128 %b) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vrepib %v2, 42
+; CHECK-NEXT: vsrlb %v1, %v1, %v2
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vsrl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 86
+; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f7:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsldb %v0, %v1, %v0, 11
+; Z15-NEXT: vsldb %v1, %v1, %v1, 11
+; Z15-NEXT: vsrd %v0, %v1, %v0, 2
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 42)
+ ret i128 %res
+}
+
+; Funnel shift right by a variable amount.
+define i128 @f8(i128 %a, i128 %b, i128 %sh) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: l %r0, 12(%r5)
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vrepb %v2, %v2, 15
+; CHECK-NEXT: vsrlb %v1, %v1, %v2
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vsrl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 1
+; CHECK-NEXT: xilf %r0, 4294967295
+; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vrepb %v2, %v2, 15
+; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f8:
+; Z15: # %bb.0:
+; Z15-NEXT: l %r0, 12(%r5)
+; Z15-NEXT: vlvgp %v2, %r0, %r0
+; Z15-NEXT: vl %v1, 0(%r4), 3
+; Z15-NEXT: vrepb %v2, %v2, 15
+; Z15-NEXT: vl %v0, 0(%r3), 3
+; Z15-NEXT: vsrlb %v1, %v1, %v2
+; Z15-NEXT: vsrl %v1, %v1, %v2
+; Z15-NEXT: vrepib %v2, 1
+; Z15-NEXT: xilf %r0, 4294967295
+; Z15-NEXT: vsl %v0, %v0, %v2
+; Z15-NEXT: vlvgp %v2, %r0, %r0
+; Z15-NEXT: vrepb %v2, %v2, 15
+; Z15-NEXT: vslb %v0, %v0, %v2
+; Z15-NEXT: vsl %v0, %v0, %v2
+; Z15-NEXT: vo %v0, %v0, %v1
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 %sh)
+ ret i128 %res
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
index 249136af1c737..f7bbad9055afd 100644
--- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
+++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -137,33 +137,29 @@ define void @fun2(<8 x i32> %src, ptr %p)
; CHECK-NEXT: vrepib %v5, 58
; CHECK-NEXT: vsrlb %v1, %v1, %v4
; CHECK-NEXT: vsteg %v1, 16(%r2), 1
-; CHECK-NEXT: vrepib %v1, 120
-; CHECK-NEXT: vrepib %v4, 89
-; CHECK-NEXT: vsrlb %v1, %v3, %v1
-; CHECK-NEXT: vlvgp %v3, %r0, %r0
+; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: vlgvf %r0, %v24, 1
-; CHECK-NEXT: vslb %v3, %v3, %v4
-; CHECK-NEXT: vsl %v3, %v3, %v4
+; CHECK-NEXT: vrepib %v4, 89
+; CHECK-NEXT: vslb %v1, %v1, %v4
+; CHECK-NEXT: vsl %v1, %v1, %v4
; CHECK-NEXT: vlvgp %v4, %r0, %r0
; CHECK-NEXT: vlgvf %r0, %v24, 2
; CHECK-NEXT: vn %v4, %v4, %v2
; CHECK-NEXT: vslb %v4, %v4, %v5
; CHECK-NEXT: vsl %v4, %v4, %v5
-; CHECK-NEXT: vo %v3, %v3, %v4
+; CHECK-NEXT: vo %v1, %v1, %v4
; CHECK-NEXT: vlvgp %v4, %r0, %r0
; CHECK-NEXT: vn %v2, %v4, %v2
; CHECK-NEXT: vrepib %v4, 27
; CHECK-NEXT: vslb %v2, %v2, %v4
; CHECK-NEXT: vsl %v2, %v2, %v4
-; CHECK-NEXT: vo %v2, %v3, %v2
-; CHECK-NEXT: vl %v3, 0(%r1), 3
-; CHECK-NEXT: vn %v0, %v0, %v3
-; CHECK-NEXT: vrepib %v3, 4
-; CHECK-NEXT: vsrl %v0, %v0, %v3
-; CHECK-NEXT: vo %v0, %v2, %v0
-; CHECK-NEXT: vrepib %v2, 8
-; CHECK-NEXT: vslb %v0, %v0, %v2
-; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vo %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 0(%r1), 3
+; CHECK-NEXT: vn %v0, %v0, %v2
+; CHECK-NEXT: vrepib %v2, 4
+; CHECK-NEXT: vsrl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vsldb %v0, %v0, %v3, 1
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
{
More information about the llvm-commits
mailing list