[llvm] [PowerPC] Dag Combine to merge vsr(vsro(in, shift), shift) to vsrq(input,shift) (PR #154388)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 19 10:10:52 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-powerpc
Author: Tony Varghese (tonykuttai)
<details>
<summary>Changes</summary>
This change implements a DAG combine that merges consecutive `VSRO (Vector Shift Right Octet)` and `VSR (Vector Shift Right)` instructions into a single `VSRQ (Vector Shift Right Quadword)` instruction on Power10+ processors. Vector right shift operations like `vec_srl(vec_sro(v, s), s)` generate two separate instructions (`VSRO` + `VSR`) when they could be optimized into a single `VSRQ` instruction that performs the equivalent operation.
Note:
```
vsro : Vector Shift Right by Octet VX-form
- vsro VRT, VRA, VRB
- The contents of VSR[VRA+32] are shifted right by the number of bytes specified in bits 121:124 of VSR[VRB+32].
- Bytes shifted out of byte 15 are lost.
- Zeros are supplied to the vacated bytes on the left.
- The result is placed into VSR[VRT+32].
vsr : Vector Shift Right VX-form
- vsr VRT, VRA, VRB
- The contents of VSR[VRA+32] are shifted right by the number of bits (a 3-bit value, 0-7) specified in bits 125:127 of VSR[VRB+32].
- Bits shifted out of bit 127 are lost.
- Zeros are supplied to the vacated bits on the left.
- The result is placed into VSR[VRT+32], except that if, for any byte element in VSR[VRB+32], the low-order 3 bits are not equal to the shift amount, VSR[VRT+32] is undefined.
vsrq : Vector Shift Right Quadword VX-form
- vsrq VRT,VRA,VRB
- Let src1 be the contents of VSR[VRA+32]. Let src2 be the contents of VSR[VRB+32].
- src1 is shifted right by the number of bits specified in the low-order 7 bits of src2.
- Bits shifted out of the least-significant bit are lost.
- Zeros are supplied to the vacated bits on the left.
- The result is placed into VSR[VRT+32].
```
---
Patch is 20.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154388.diff
5 Files Affected:
- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+66)
- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.h (+4)
- (modified) llvm/lib/Target/PowerPC/PPCInstrInfo.td (+6)
- (modified) llvm/lib/Target/PowerPC/PPCInstrP10.td (+2-1)
- (added) llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll (+337)
``````````diff
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 652edd4e04c60..c7c17c2d0f85a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1697,6 +1697,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::XXPERM:
return "PPCISD::XXPERM";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
+ case PPCISD::VSRQ: return "PPCISD::VSRQ";
case PPCISD::CMPB: return "PPCISD::CMPB";
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
@@ -16680,6 +16681,67 @@ SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
llvm_unreachable("Expected a load or store node here");
}
+// Combine VSR(VSRO input, shift), shift) to VSRQ(input, shift)
+//
+// PowerPC Vector Shift Instructions:
+// - vsro (Vector Shift Right by Octet): Shifts vector right by N bytes,
+// where N is specified in bits 121:124 of the shift vector (4 bits, 0-15
+// bytes)
+// - vsr (Vector Shift Right): Shifts vector right by N bits,
+// where N is specified in bits 125:127 of the shift vector (3 bits, 0-7 bits)
+// - vsrq (Vector Shift Right Quadword): Shifts vector right by N bits,
+// where N is specified in bits 57:63 of the shift vector (7 bits, 0-127 bits)
+//
+// Input DAG pattern: vsr(vsro(input, shift_vector), shift_vector)
+// performs the following shifts:
+// 1. vsro: input >> (bits[121:124] * 8) bits [byte shifts converted to
+// bits]
+// 2. vsr: result >> bits[125:127] bits [additional bit shifts]
+// Total shift = (bits[121:124] * 8) + bits[125:127] bits
+//
+// Since bits 121:127 form a 7-bit value representing the total shift amount,
+// and vsrq uses the same 7-bit shift amount (assuming bits 57:63 map to
+// 121:127), we can replace the two-instruction sequence with a single vsrq
+// instruction.
+//
+// Optimization: vsr(vsro(input, shift), shift) -> vsrq(input, shift)
+SDValue PPCTargetLowering::combineVSROVSRToVSRQ(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ // Only available on ISA 3.1+ (Power10+)
+ if (!Subtarget.isISA3_1())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue VSRInput = N->getOperand(1);
+ SDValue VSRShift = N->getOperand(2);
+
+ // Check if VSR input comes from a VSRO intrinsic
+ if (VSRInput.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+ return SDValue();
+
+ unsigned VSROIntrinsicID = VSRInput->getConstantOperandVal(0);
+ if (VSROIntrinsicID != Intrinsic::ppc_altivec_vsro)
+ return SDValue();
+
+ // Check if VSRO uses the same shift amount register as VSR
+ SDValue VSROShift = VSRInput.getOperand(2);
+ if (VSRShift != VSROShift)
+ return SDValue();
+
+ // Check single use - VSRO result should only be used by this VSR
+ if (!VSRInput.hasOneUse())
+ return SDValue();
+
+ // Get the original input to VSRO instruction
+ SDValue VSROOrigInput = VSRInput.getOperand(1);
+
+ return DAG.getNode(PPCISD::VSRQ, SDLoc(N),
+ N->getValueType(0), // Preserve original result type
+ VSROOrigInput, // Original input vector
+ VSRShift); // Shift amount
+}
+
static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
unsigned IntrinsicID = Intrin.getConstantOperandVal(1);
if (IntrinsicID == Intrinsic::ppc_stdcx)
@@ -17207,6 +17269,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}
+
+ // combine VSRO + VSR intrinsic calls to optimize with VSRQ
+ if (IID == Intrinsic::ppc_altivec_vsr)
+ return combineVSROVSRToVSRQ(N, DCI);
}
break;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 5e0d6bf184f20..362ccdfb26efa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -498,6 +498,9 @@ namespace llvm {
/// SETBCR - The ISA 3.1 (P10) SETBCR instruction.
SETBCR,
+ /// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction
+ VSRQ,
+
// NOTE: The nodes below may require PC-Rel specific patterns if the
// address could be PC-Relative. When adding new nodes below, consider
// whether or not the address can be PC-Relative and add the corresponding
@@ -1447,6 +1450,7 @@ namespace llvm {
SelectionDAG &DAG) const;
SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
DAGCombinerInfo &DCI) const;
+ SDValue combineVSROVSRToVSRQ(SDNode *N, DAGCombinerInfo &DCI) const;
/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
/// SETCC with integer subtraction when (1) there is a legal way of doing it
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index c2f91ce8e6b96..1a57d622a5f6c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -58,6 +58,10 @@ def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
]>;
+def SDT_PPCVecShiftQuad : SDTypeProfile<1, 2, [
+ SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>
+]>;
+
def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
]>;
@@ -157,6 +161,8 @@ def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+def PPCvsrq: SDNode<"PPCISD::VSRQ", SDT_PPCVecShiftQuad, []>;
+
def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID",
SDTFPUnaryOp, [SDNPHasChain]>;
def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU",
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index 98dd8464c0ac8..902c40544ac28 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -1919,7 +1919,8 @@ let Predicates = [IsISA3_1] in {
RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>;
def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>;
- def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>;
+ def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq",
+ [(set v4i32:$VD, (PPCvsrq v4i32:$VA, v4i32:$VB))]>;
def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>;
def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>;
def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>;
diff --git a/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll b/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll
new file mode 100644
index 0000000000000..c2599c8f6af13
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll
@@ -0,0 +1,337 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER10-LE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER10-BE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER3210-BE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER9-LE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER9-BE
+
+; Test VSRO + VSR peephole optimization to VSRQ on Power10+
+; This should combine consecutive VSRO (Vector Shift Right Octet) and VSR (Vector Shift Right)
+; instructions using the same shift amount into a single VSRQ (Vector Shift Right Quadword)
+; instruction when targeting Power10 or later processors.
+declare <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32>, <4 x i32>)
+
+define <16 x i8> @shiftright128_v16i8(<16 x i8> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v16i8:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsrq v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: shiftright128_v16i8:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsrq v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: shiftright128_v16i8:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsrq v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: shiftright128_v16i8:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: shiftright128_v16i8:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+ %4 = bitcast <4 x i32> %3 to <16 x i8>
+ ret <16 x i8> %4
+}
+
+define <4 x i32> @shiftright128_v4i32(<4 x i32> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v4i32:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsrq v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: shiftright128_v4i32:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsrq v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: shiftright128_v4i32:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsrq v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: shiftright128_v4i32:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: shiftright128_v4i32:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %1 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %in, <4 x i32> %0)
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %1, <4 x i32> %0)
+ ret <4 x i32> %2
+}
+
+define <2 x i64> @shiftright128_v2i64(<2 x i64> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v2i64:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsrq v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: shiftright128_v2i64:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsrq v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: shiftright128_v2i64:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsrq v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: shiftright128_v2i64:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: shiftright128_v2i64:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <2 x i64> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+ %4 = bitcast <4 x i32> %3 to <2 x i64>
+ ret <2 x i64> %4
+}
+
+define <8 x i16> @shiftright128_v8i16(<8 x i16> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v8i16:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsrq v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: shiftright128_v8i16:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsrq v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: shiftright128_v8i16:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsrq v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: shiftright128_v8i16:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: shiftright128_v8i16:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <8 x i16> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+ %4 = bitcast <4 x i32> %3 to <8 x i16>
+ ret <8 x i16> %4
+}
+
+; Test case with different vectors (should not optimize - different shift amounts)
+define <16 x i8> @no_optimization_different_shifts(<16 x i8> %in, i8 zeroext %sh1, i8 zeroext %sh2) {
+; POWER10-LE-LABEL: no_optimization_different_shifts:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: mtvsrd v4, r6
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vspltb v4, v4, 7
+; POWER10-LE-NEXT: vsro v2, v2, v3
+; POWER10-LE-NEXT: vsr v2, v2, v4
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: no_optimization_different_shifts:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: mtvsrwz v4, r4
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vspltb v4, v4, 7
+; POWER10-BE-NEXT: vsro v2, v2, v3
+; POWER10-BE-NEXT: vsr v2, v2, v4
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: no_optimization_different_shifts:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: mtvsrwz v4, r4
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vspltb v4, v4, 7
+; POWER3210-BE-NEXT: vsro v2, v2, v3
+; POWER3210-BE-NEXT: vsr v2, v2, v4
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: no_optimization_different_shifts:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: mtvsrd v4, r6
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vspltb v4, v4, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v4
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: no_optimization_different_shifts:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: mtvsrwz v4, r4
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vspltb v4, v4, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v4
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh1, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %splat.splatinsert.i2 = insertelement <16 x i8> poison, i8 %sh2, i64 0
+ %splat.splat.i2 = shufflevector <16 x i8> %splat.splatinsert.i2, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = bitcast <16 x i8> %splat.splat.i2 to <4 x i32>
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %4 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %3, <4 x i32> %2)
+ %5 = bitcast <4 x i32> %4 to <16 x i8>
+ ret <16 x i8> %5
+}
+
+; Test case with multiple uses of VSRO result (should not optimize)
+define <16 x i8> @no_optimization_multiple_uses(<16 x i8> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: no_optimization_multiple_uses:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsro v2, v2, v3
+; POWER10-LE-NEXT: vsr v3, v2, v3
+; POWER10-LE-NEXT: vaddubm v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: no_optimization_multiple_uses:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsro v2, v2, v3
+; POWER10-BE-NEXT: vsr v3, v2, v3
+; POWER10-BE-NEXT: vaddubm v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: no_optimization_multiple_uses:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsro v2, v2, v3
+; POWER3210-BE-NEXT: vsr v3, v2, v3
+; POWER3210-BE-NEXT: vaddubm v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: no_optimization_multiple_uses:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v3, v2, v3
+; POWER9-LE-NEXT: vaddubm v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: no_optimization_multiple_uses:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v3, v2, v3
+; POWER9-BE-NEXT: vaddubm v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/154388
More information about the llvm-commits
mailing list