[llvm] s390x: optimize 128-bit fshl and fshr by high values (PR #154919)
Folkert de Vries via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 22 03:19:53 PDT 2025
https://github.com/folkertdev created https://github.com/llvm/llvm-project/pull/154919
Turn a funnel shift by N in the range `121..128` into a funnel shift in the opposite direction by `128 - N`. Because there are dedicated instructions for funnel shifts by values smaller than 8, this emits fewer instructions.
This additional rule is useful because LLVM appears to canonicalize `fshr` into `fshl`, meaning that the rules for `fshr` on values less than 8 would not match on organic input.
I reported this in https://github.com/llvm/llvm-project/issues/129955#issuecomment-3207488190, where a `fshr(a, b, 5)` is canonicalized into `fshl(a, b, 123)`.
https://godbolt.org/z/ossMvr31E
``` llvm
define <16 x i8> @vec_sld_manual(<16 x i8> %a, <16 x i8> %b) unnamed_addr {
start:
%0 = bitcast <16 x i8> %a to i128
%1 = bitcast <16 x i8> %b to i128
%_3 = tail call i128 @llvm.fshl.i128(i128 %0, i128 %1, i128 123)
%2 = bitcast i128 %_3 to <16 x i8>
ret <16 x i8> %2
}
define <16 x i8> @vec_sld_builtin(<16 x i8> %a, <16 x i8> %b) unnamed_addr {
start:
%_0 = tail call <16 x i8> @llvm.s390.vsrd(<16 x i8> %a, <16 x i8> %b, i32 noundef 5) #4
ret <16 x i8> %_0
}
declare <16 x i8> @llvm.s390.vsrd(<16 x i8>, <16 x i8>, i32 immarg) unnamed_addr #2
declare i128 @llvm.fshl.i128(i128, i128, i128) #3
```
resulting in
```asm
vec_sld_manual:
vsldb %v0, %v24, %v26, 15
vsldb %v1, %v26, %v26, 15
vsld %v24, %v0, %v1, 3
br %r14
vec_sld_builtin:
vsrd %v24, %v24, %v26, 5
br %r14
```
with this PR both functions generate the same, smaller assembly.
---
cc @uweigand
>From 87ec3241063980ee7423d2365c872f7dd07f6cc0 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Fri, 22 Aug 2025 11:20:02 +0200
Subject: [PATCH] s390x: optimize 128-bit fshl and fshr by high values
Turn a funnel shift by N in the range 121..128 into a funnel shift in
the opposite direction by `128 - N`. Because there are dedicated
instructions for funnel shifts by values smaller than 8, this emits
fewer instructions.
This additional rule is useful because LLVM appears to canonicalize
`fshr` into `fshl`, meaning that the rules for `fshr` on values less
than 8 would not match on organic input.
---
.../Target/SystemZ/SystemZISelLowering.cpp | 91 ++++++++++++-------
llvm/test/CodeGen/SystemZ/shift-17.ll | 51 +++++++++++
2 files changed, 107 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c73dc3021eb42..2c8b4f478a6a4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6706,30 +6706,70 @@ SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
return Op;
}
-SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
+SDValue lowerFSHLHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt);
+SDValue lowerFSHRHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt);
+
+SDValue lowerFSHLHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt) {
SDLoc DL(Op);
+ // A 128-bit FSHR by a small N is canonicalized to an FSHL by (128 - N).
+ // Convert back to an FSHR to make use of SHR_DOUBLE_BIT.
+ if (ShiftAmt > 120)
+ return lowerFSHRHelp(Op, DAG, 128 - ShiftAmt);
+
// i128 FSHL with a constant amount that is a multiple of 8 can be
- // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
+ // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
// facility, FSHL with a constant amount less than 8 can be implemented
// via SHL_DOUBLE_BIT, and FSHL with other constant amounts by a
// combination of the two.
+ SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+ SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ SmallVector<int, 16> Mask(16);
+ for (unsigned Elt = 0; Elt < 16; Elt++)
+ Mask[Elt] = (ShiftAmt >> 3) + Elt;
+ SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+ if ((ShiftAmt & 7) == 0)
+ return DAG.getBitcast(MVT::i128, Shuf1);
+ SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
+ DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+}
+
+SDValue lowerFSHRHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt) {
+ SDLoc DL(Op);
+
+ // Unlikely to come up organically because LLVM prefers FSHL.
+ // Convert back to an FSHL to make use of SHL_DOUBLE_BIT.
+ if (ShiftAmt > 120)
+ return lowerFSHLHelp(Op, DAG, 128 - ShiftAmt);
+
+ // i128 FSHR with a constant amount that is a multiple of 8 can be
+ // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
+ // facility, FSHR with a constant amount less than 8 can be implemented
+ // via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a
+ // combination of the two.
+ SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+ SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ SmallVector<int, 16> Mask(16);
+ for (unsigned Elt = 0; Elt < 16; Elt++)
+ Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
+ SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+ if ((ShiftAmt & 7) == 0)
+ return DAG.getBitcast(MVT::i128, Shuf1);
+ SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
+ DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+}
+
+SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
- SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
- SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
- SmallVector<int, 16> Mask(16);
- for (unsigned Elt = 0; Elt < 16; Elt++)
- Mask[Elt] = (ShiftAmt >> 3) + Elt;
- SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
- if ((ShiftAmt & 7) == 0)
- return DAG.getBitcast(MVT::i128, Shuf1);
- SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
- SDValue Val =
- DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
- DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
- return DAG.getBitcast(MVT::i128, Val);
+ return lowerFSHLHelp(Op, DAG, ShiftAmt);
}
}
@@ -6737,29 +6777,10 @@ SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
-
- // i128 FSHR with a constant amount that is a multiple of 8 can be
- // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
- // facility, FSHR with a constant amount less than 8 can be implemented
- // via SHL_DOUBLE_BIT, and FSHR with other constant amounts by a
- // combination of the two.
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
- SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
- SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
- SmallVector<int, 16> Mask(16);
- for (unsigned Elt = 0; Elt < 16; Elt++)
- Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
- SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
- if ((ShiftAmt & 7) == 0)
- return DAG.getBitcast(MVT::i128, Shuf1);
- SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
- SDValue Val =
- DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
- DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
- return DAG.getBitcast(MVT::i128, Val);
+ return lowerFSHRHelp(Op, DAG, ShiftAmt);
}
}
diff --git a/llvm/test/CodeGen/SystemZ/shift-17.ll b/llvm/test/CodeGen/SystemZ/shift-17.ll
index 45f4ed4d70d20..8f5f9abd0540b 100644
--- a/llvm/test/CodeGen/SystemZ/shift-17.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-17.ll
@@ -249,3 +249,54 @@ define i128 @f8(i128 %a, i128 %b, i128 %sh) {
ret i128 %res
}
+; Funnel shift left by a constant N in 121..128; in such cases fshl N == fshr (128 - N)
+define i128 @f9(i128 %a, i128 %b) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepib %v2, 5
+; CHECK-NEXT: vsrl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 123
+; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f9:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsrd %v0, %v1, %v0, 5
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 123)
+ ret i128 %res
+}
+
+; Funnel shift right by a constant N in 121..128; in such cases fshr N == fshl (128 - N)
+define i128 @f10(i128 %a, i128 %b) {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vrepib %v2, 5
+; CHECK-NEXT: vsl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 123
+; CHECK-NEXT: vsrlb %v0, %v0, %v2
+; CHECK-NEXT: vsrl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f10:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsld %v0, %v1, %v0, 5
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 123)
+ ret i128 %res
+}
More information about the llvm-commits
mailing list