[llvm] s390x: optimize 128-bit fshl and fshr by high values (PR #154919)
Folkert de Vries via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 22 03:19:53 PDT 2025
https://github.com/folkertdev created https://github.com/llvm/llvm-project/pull/154919
Turn a funnel shift by N in the range `121..128` into a funnel shift in the opposite direction by `128 - N`. Because there are dedicated instructions for funnel shifts by values smaller than 8, this emits fewer instructions.
This additional rule is useful because LLVM appears to canonicalize `fshr` into `fshl`, meaning that the rules for `fshr` on values less than 8 would not match on organic input.
I reported this in https://github.com/llvm/llvm-project/issues/129955#issuecomment-3207488190, where a `fshr(a, b, 5)` is canonicalized into `fshl(a, b, 123)`.
https://godbolt.org/z/ossMvr31E
``` llvm
define <16 x i8> @vec_sld_manual(<16 x i8> %a, <16 x i8> %b) unnamed_addr {
start:
%0 = bitcast <16 x i8> %a to i128
%1 = bitcast <16 x i8> %b to i128
%_3 = tail call i128 @llvm.fshl.i128(i128 %0, i128 %1, i128 123)
%2 = bitcast i128 %_3 to <16 x i8>
ret <16 x i8> %2
}
define <16 x i8> @vec_sld_builtin(<16 x i8> %a, <16 x i8> %b) unnamed_addr {
start:
%_0 = tail call <16 x i8> @llvm.s390.vsrd(<16 x i8> %a, <16 x i8> %b, i32 noundef 5) #4
ret <16 x i8> %_0
}
declare <16 x i8> @llvm.s390.vsrd(<16 x i8>, <16 x i8>, i32 immarg) unnamed_addr #2
declare i128 @llvm.fshl.i128(i128, i128, i128) #3
```
resulting in
```asm
vec_sld_manual:
vsldb %v0, %v24, %v26, 15
vsldb %v1, %v26, %v26, 15
vsld %v24, %v0, %v1, 3
br %r14
vec_sld_builtin:
vsrd %v24, %v24, %v26, 5
br %r14
```
with this PR both functions generate the same, smaller assembly.
---
cc @uweigand
>From 87ec3241063980ee7423d2365c872f7dd07f6cc0 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Fri, 22 Aug 2025 11:20:02 +0200
Subject: [PATCH] s390x: optimize 128-bit fshl and fshr by high values
Turn a funnel shift by N in the range 121..128 into a funnel shift in
the opposite direction by `128 - N`. Because there are dedicated
instructions for funnel shifts by values smaller than 8, this emits
fewer instructions.
This additional rule is useful because LLVM appears to canonicalize
`fshr` into `fshl`, meaning that the rules for `fshr` on values less
than 8 would not match on organic input.
---
.../Target/SystemZ/SystemZISelLowering.cpp | 91 ++++++++++++-------
llvm/test/CodeGen/SystemZ/shift-17.ll | 51 +++++++++++
2 files changed, 107 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c73dc3021eb42..2c8b4f478a6a4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6706,30 +6706,70 @@ SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
return Op;
}
-SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
+SDValue lowerFSHLHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt);
+SDValue lowerFSHRHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt);
+
+SDValue lowerFSHLHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt) {
SDLoc DL(Op);
+ // A 128-bit FSHR by a small N is canonicalized to an FSHL by (128 - N).
+ // Convert back to an FSHR to make use of SHR_DOUBLE_BIT.
+ if (ShiftAmt > 120)
+ return lowerFSHRHelp(Op, DAG, 128 - ShiftAmt);
+
// i128 FSHL with a constant amount that is a multiple of 8 can be
- // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
+ // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
// facility, FSHL with a constant amount less than 8 can be implemented
// via SHL_DOUBLE_BIT, and FSHL with other constant amounts by a
// combination of the two.
+ SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+ SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ SmallVector<int, 16> Mask(16);
+ for (unsigned Elt = 0; Elt < 16; Elt++)
+ Mask[Elt] = (ShiftAmt >> 3) + Elt;
+ SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+ if ((ShiftAmt & 7) == 0)
+ return DAG.getBitcast(MVT::i128, Shuf1);
+ SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
+ DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+}
+
+SDValue lowerFSHRHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt) {
+ SDLoc DL(Op);
+
+ // Unlikely to come up organically because LLVM prefers FSHL.
+ // Convert back to an FSHL to make use of SHL_DOUBLE_BIT.
+ if (ShiftAmt > 120)
+ return lowerFSHLHelp(Op, DAG, 128 - ShiftAmt);
+
+ // i128 FSHR with a constant amount that is a multiple of 8 can be
+ // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
+ // facility, FSHR with a constant amount less than 8 can be implemented
+ // via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a
+ // combination of the two.
+ SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+ SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ SmallVector<int, 16> Mask(16);
+ for (unsigned Elt = 0; Elt < 16; Elt++)
+ Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
+ SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+ if ((ShiftAmt & 7) == 0)
+ return DAG.getBitcast(MVT::i128, Shuf1);
+ SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
+ DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+}
+
+SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
- SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
- SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
- SmallVector<int, 16> Mask(16);
- for (unsigned Elt = 0; Elt < 16; Elt++)
- Mask[Elt] = (ShiftAmt >> 3) + Elt;
- SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
- if ((ShiftAmt & 7) == 0)
- return DAG.getBitcast(MVT::i128, Shuf1);
- SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
- SDValue Val =
- DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
- DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
- return DAG.getBitcast(MVT::i128, Val);
+ return lowerFSHLHelp(Op, DAG, ShiftAmt);
}
}
@@ -6737,29 +6777,10 @@ SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
-
- // i128 FSHR with a constant amount that is a multiple of 8 can be
- // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
- // facility, FSHR with a constant amount less than 8 can be implemented
- // via SHL_DOUBLE_BIT, and FSHR with other constant amounts by a
- // combination of the two.
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
- SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
- SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
- SmallVector<int, 16> Mask(16);
- for (unsigned Elt = 0; Elt < 16; Elt++)
- Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
- SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
- if ((ShiftAmt & 7) == 0)
- return DAG.getBitcast(MVT::i128, Shuf1);
- SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
- SDValue Val =
- DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
- DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
- return DAG.getBitcast(MVT::i128, Val);
+ return lowerFSHRHelp(Op, DAG, ShiftAmt);
}
}
diff --git a/llvm/test/CodeGen/SystemZ/shift-17.ll b/llvm/test/CodeGen/SystemZ/shift-17.ll
index 45f4ed4d70d20..8f5f9abd0540b 100644
--- a/llvm/test/CodeGen/SystemZ/shift-17.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-17.ll
@@ -249,3 +249,54 @@ define i128 @f8(i128 %a, i128 %b, i128 %sh) {
ret i128 %res
}
+; Funnel shift left by a constant N in 121..128; in such cases fshl N == fshr (128 - N)
+define i128 @f9(i128 %a, i128 %b) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepib %v2, 5
+; CHECK-NEXT: vsrl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 123
+; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f9:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsrd %v0, %v1, %v0, 5
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 123)
+ ret i128 %res
+}
+
+; Funnel shift right by a constant N in 121..128; in such cases fshr N == fshl (128 - N)
+define i128 @f10(i128 %a, i128 %b) {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vrepib %v2, 5
+; CHECK-NEXT: vsl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 123
+; CHECK-NEXT: vsrlb %v0, %v0, %v2
+; CHECK-NEXT: vsrl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f10:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsld %v0, %v1, %v0, 5
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 123)
+ ret i128 %res
+}
More information about the llvm-commits
mailing list