[llvm] [PowerPC] Merge vsr(vsro(input, byte_shift), bit_shift) to vsrq(input, res_bit_shift) (PR #154388)

Thu Aug 21 22:55:26 PDT 2025

https://github.com/tonykuttai updated https://github.com/llvm/llvm-project/pull/154388

>From d288a264811a815d14b07e31378f466f0122e79a Mon Sep 17 00:00:00 2001
From: Tony Varghese <tony.varghese at ibm.com>
Date: Tue, 19 Aug 2025 12:05:47 -0500
Subject: [PATCH 1/4] [PowerPC] Dag Combine to merge vsr(vsro(in, shift),
 shift) to vsrq(input, shift)

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  67 ++++
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |   4 +
 llvm/lib/Target/PowerPC/PPCInstrInfo.td       |   6 +
 llvm/lib/Target/PowerPC/PPCInstrP10.td        |   3 +-
 .../PowerPC/vsro-vsr-vsrq-dag-combine.ll      | 337 ++++++++++++++++++
 5 files changed, 416 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 652edd4e04c60..41a9a5d827478 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1697,6 +1697,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::XXPERM:
     return "PPCISD::XXPERM";
   case PPCISD::VECSHL:          return "PPCISD::VECSHL";
+  case PPCISD::VSRQ:
+    return "PPCISD::VSRQ";
   case PPCISD::CMPB:            return "PPCISD::CMPB";
   case PPCISD::Hi:              return "PPCISD::Hi";
   case PPCISD::Lo:              return "PPCISD::Lo";
@@ -16680,6 +16682,67 @@ SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
   llvm_unreachable("Expected a load or store node here");
 }
 
+// Combine VSR(VSRO input, shift), shift) to VSRQ(input, shift)
+//
+// PowerPC Vector Shift Instructions:
+// - vsro (Vector Shift Right by Octet): Shifts vector right by N bytes,
+//   where N is specified in bits 121:124 of the shift vector (4 bits, 0-15
+//   bytes)
+// - vsr (Vector Shift Right): Shifts vector right by N bits,
+//   where N is specified in bits 125:127 of the shift vector (3 bits, 0-7 bits)
+// - vsrq (Vector Shift Right Quadword): Shifts vector right by N bits,
+//   where N is specified in bits 57:63 of the shift vector (7 bits, 0-127 bits)
+//
+// Input DAG pattern: vsr(vsro(input, shift_vector), shift_vector)
+// performs the following shifts:
+//   1. vsro: input >> (bits[121:124] * 8) bits   [byte shifts converted to
+//   bits]
+//   2. vsr:  result >> bits[125:127] bits        [additional bit shifts]
+//   Total shift = (bits[121:124] * 8) + bits[125:127] bits
+//
+// Since bits 121:127 form a 7-bit value representing the total shift amount,
+// and vsrq uses the same 7-bit shift amount (assuming bits 57:63 map to
+// 121:127), we can replace the two-instruction sequence with a single vsrq
+// instruction.
+//
+// Optimization: vsr(vsro(input, shift), shift) -> vsrq(input, shift)
+SDValue PPCTargetLowering::combineVSROVSRToVSRQ(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+
+  // Only available on ISA 3.1+ (Power10+)
+  if (!Subtarget.isISA3_1())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue VSRInput = N->getOperand(1);
+  SDValue VSRShift = N->getOperand(2);
+
+  // Check if VSR input comes from a VSRO intrinsic
+  if (VSRInput.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+    return SDValue();
+
+  unsigned VSROIntrinsicID = VSRInput->getConstantOperandVal(0);
+  if (VSROIntrinsicID != Intrinsic::ppc_altivec_vsro)
+    return SDValue();
+
+  // Check if VSRO uses the same shift amount register as VSR
+  SDValue VSROShift = VSRInput.getOperand(2);
+  if (VSRShift != VSROShift)
+    return SDValue();
+
+  // Check single use - VSRO result should only be used by this VSR
+  if (!VSRInput.hasOneUse())
+    return SDValue();
+
+  // Get the original input to VSRO instruction
+  SDValue VSROOrigInput = VSRInput.getOperand(1);
+
+  return DAG.getNode(PPCISD::VSRQ, SDLoc(N),
+                     N->getValueType(0), // Preserve original result type
+                     VSROOrigInput,      // Original input vector
+                     VSRShift);          // Shift amount
+}
+
 static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
   unsigned IntrinsicID = Intrin.getConstantOperandVal(1);
   if (IntrinsicID == Intrinsic::ppc_stdcx)
@@ -17207,6 +17270,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
           }
         }
       }
+
+      // combine VSRO + VSR intrinsic calls to optimize with VSRQ
+      if (IID == Intrinsic::ppc_altivec_vsr)
+        return combineVSROVSRToVSRQ(N, DCI);
     }
 
     break;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 5e0d6bf184f20..362ccdfb26efa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -498,6 +498,9 @@ namespace llvm {
     /// SETBCR - The ISA 3.1 (P10) SETBCR instruction.
     SETBCR,
 
+    /// VSRQ - The ISA 3.1 (P10) Vector Shift Right Quadword instruction.
+    VSRQ,
+
     // NOTE: The nodes below may require PC-Rel specific patterns if the
     // address could be PC-Relative. When adding new nodes below, consider
     // whether or not the address can be PC-Relative and add the corresponding
@@ -1447,6 +1450,7 @@ namespace llvm {
                                  SelectionDAG &DAG) const;
     SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
                                  DAGCombinerInfo &DCI) const;
+    SDValue combineVSROVSRToVSRQ(SDNode *N, DAGCombinerInfo &DCI) const;
 
     /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
     /// SETCC with integer subtraction when (1) there is a legal way of doing it
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index c2f91ce8e6b96..1a57d622a5f6c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -58,6 +58,10 @@ def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
   SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
 ]>;
 
+def SDT_PPCVecShiftQuad : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>
+]>;
+
 def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
   SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
 ]>;
@@ -157,6 +161,8 @@ def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
 def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
 def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
 
+def PPCvsrq: SDNode<"PPCISD::VSRQ", SDT_PPCVecShiftQuad, []>;
+
 def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID",
                              SDTFPUnaryOp, [SDNPHasChain]>;
 def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU",
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index 98dd8464c0ac8..902c40544ac28 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -1919,7 +1919,8 @@ let Predicates = [IsISA3_1] in {
                         RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
   def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>;
   def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>;
-  def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>;
+  def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", 
+                            [(set v4i32:$VD, (PPCvsrq v4i32:$VA, v4i32:$VB))]>;
   def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>;
   def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>;
   def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>;
diff --git a/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll b/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll
new file mode 100644
index 0000000000000..c2599c8f6af13
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll
@@ -0,0 +1,337 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER10-LE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER10-BE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER3210-BE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER9-LE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER9-BE
+
+; Test that a VSRO + VSR pair is merged into a single VSRQ on Power10+.
+; Consecutive VSRO (Vector Shift Right by Octet) and VSR (Vector Shift Right)
+; intrinsics that use the same shift vector should become one VSRQ (Vector Shift
+; Right Quadword) instruction; the pwr9 runs must keep the vsro + vsr pair.
+declare <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32>, <4 x i32>)
+
+define <16 x i8> @shiftright128_v16i8(<16 x i8> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v16i8:
+; POWER10-LE:       # %bb.0: # %entry
+; POWER10-LE-NEXT:    mtvsrd v3, r5
+; POWER10-LE-NEXT:    vspltb v3, v3, 7
+; POWER10-LE-NEXT:    vsrq v2, v2, v3
+; POWER10-LE-NEXT:    blr
+;
+; POWER10-BE-LABEL: shiftright128_v16i8:
+; POWER10-BE:       # %bb.0: # %entry
+; POWER10-BE-NEXT:    mtvsrwz v3, r3
+; POWER10-BE-NEXT:    vspltb v3, v3, 7
+; POWER10-BE-NEXT:    vsrq v2, v2, v3
+; POWER10-BE-NEXT:    blr
+;
+; POWER3210-BE-LABEL: shiftright128_v16i8:
+; POWER3210-BE:       # %bb.0: # %entry
+; POWER3210-BE-NEXT:    mtvsrwz v3, r3
+; POWER3210-BE-NEXT:    vspltb v3, v3, 7
+; POWER3210-BE-NEXT:    vsrq v2, v2, v3
+; POWER3210-BE-NEXT:    blr
+;
+; POWER9-LE-LABEL: shiftright128_v16i8:
+; POWER9-LE:       # %bb.0: # %entry
+; POWER9-LE-NEXT:    mtvsrd v3, r5
+; POWER9-LE-NEXT:    vspltb v3, v3, 7
+; POWER9-LE-NEXT:    vsro v2, v2, v3
+; POWER9-LE-NEXT:    vsr v2, v2, v3
+; POWER9-LE-NEXT:    blr
+;
+; POWER9-BE-LABEL: shiftright128_v16i8:
+; POWER9-BE:       # %bb.0: # %entry
+; POWER9-BE-NEXT:    mtvsrwz v3, r3
+; POWER9-BE-NEXT:    vspltb v3, v3, 7
+; POWER9-BE-NEXT:    vsro v2, v2, v3
+; POWER9-BE-NEXT:    vsr v2, v2, v3
+; POWER9-BE-NEXT:    blr
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+  %0 = bitcast <16 x i8> %in to <4 x i32>
+  %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+  %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+  %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+  %4 = bitcast <4 x i32> %3 to <16 x i8>
+  ret <16 x i8> %4
+}
+
+define <4 x i32> @shiftright128_v4i32(<4 x i32> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v4i32:
+; POWER10-LE:       # %bb.0: # %entry
+; POWER10-LE-NEXT:    mtvsrd v3, r5
+; POWER10-LE-NEXT:    vspltb v3, v3, 7
+; POWER10-LE-NEXT:    vsrq v2, v2, v3
+; POWER10-LE-NEXT:    blr
+;
+; POWER10-BE-LABEL: shiftright128_v4i32:
+; POWER10-BE:       # %bb.0: # %entry
+; POWER10-BE-NEXT:    mtvsrwz v3, r3
+; POWER10-BE-NEXT:    vspltb v3, v3, 7
+; POWER10-BE-NEXT:    vsrq v2, v2, v3
+; POWER10-BE-NEXT:    blr
+;
+; POWER3210-BE-LABEL: shiftright128_v4i32:
+; POWER3210-BE:       # %bb.0: # %entry
+; POWER3210-BE-NEXT:    mtvsrwz v3, r3
+; POWER3210-BE-NEXT:    vspltb v3, v3, 7
+; POWER3210-BE-NEXT:    vsrq v2, v2, v3
+; POWER3210-BE-NEXT:    blr
+;
+; POWER9-LE-LABEL: shiftright128_v4i32:
+; POWER9-LE:       # %bb.0: # %entry
+; POWER9-LE-NEXT:    mtvsrd v3, r5
+; POWER9-LE-NEXT:    vspltb v3, v3, 7
+; POWER9-LE-NEXT:    vsro v2, v2, v3
+; POWER9-LE-NEXT:    vsr v2, v2, v3
+; POWER9-LE-NEXT:    blr
+;
+; POWER9-BE-LABEL: shiftright128_v4i32:
+; POWER9-BE:       # %bb.0: # %entry
+; POWER9-BE-NEXT:    mtvsrwz v3, r3
+; POWER9-BE-NEXT:    vspltb v3, v3, 7
+; POWER9-BE-NEXT:    vsro v2, v2, v3
+; POWER9-BE-NEXT:    vsr v2, v2, v3
+; POWER9-BE-NEXT:    blr
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+  %0 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+  %1 = tail call  <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %in, <4 x i32> %0)
+  %2 = tail call  <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %1, <4 x i32> %0)
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @shiftright128_v2i64(<2 x i64> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v2i64:
+; POWER10-LE:       # %bb.0: # %entry
+; POWER10-LE-NEXT:    mtvsrd v3, r5
+; POWER10-LE-NEXT:    vspltb v3, v3, 7
+; POWER10-LE-NEXT:    vsrq v2, v2, v3
+; POWER10-LE-NEXT:    blr
+;
+; POWER10-BE-LABEL: shiftright128_v2i64:
+; POWER10-BE:       # %bb.0: # %entry
+; POWER10-BE-NEXT:    mtvsrwz v3, r3
+; POWER10-BE-NEXT:    vspltb v3, v3, 7
+; POWER10-BE-NEXT:    vsrq v2, v2, v3
+; POWER10-BE-NEXT:    blr
+;
+; POWER3210-BE-LABEL: shiftright128_v2i64:
+; POWER3210-BE:       # %bb.0: # %entry
+; POWER3210-BE-NEXT:    mtvsrwz v3, r3
+; POWER3210-BE-NEXT:    vspltb v3, v3, 7
+; POWER3210-BE-NEXT:    vsrq v2, v2, v3
+; POWER3210-BE-NEXT:    blr
+;
+; POWER9-LE-LABEL: shiftright128_v2i64:
+; POWER9-LE:       # %bb.0: # %entry
+; POWER9-LE-NEXT:    mtvsrd v3, r5
+; POWER9-LE-NEXT:    vspltb v3, v3, 7
+; POWER9-LE-NEXT:    vsro v2, v2, v3
+; POWER9-LE-NEXT:    vsr v2, v2, v3
+; POWER9-LE-NEXT:    blr
+;
+; POWER9-BE-LABEL: shiftright128_v2i64:
+; POWER9-BE:       # %bb.0: # %entry
+; POWER9-BE-NEXT:    mtvsrwz v3, r3
+; POWER9-BE-NEXT:    vspltb v3, v3, 7
+; POWER9-BE-NEXT:    vsro v2, v2, v3
+; POWER9-BE-NEXT:    vsr v2, v2, v3
+; POWER9-BE-NEXT:    blr
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+  %0 = bitcast <2 x i64> %in to <4 x i32>
+  %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+  %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+  %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+  %4 = bitcast <4 x i32> %3 to <2 x i64>
+  ret <2 x i64> %4
+}
+
+define <8 x i16> @shiftright128_v8i16(<8 x i16> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v8i16:
+; POWER10-LE:       # %bb.0: # %entry
+; POWER10-LE-NEXT:    mtvsrd v3, r5
+; POWER10-LE-NEXT:    vspltb v3, v3, 7
+; POWER10-LE-NEXT:    vsrq v2, v2, v3
+; POWER10-LE-NEXT:    blr
+;
+; POWER10-BE-LABEL: shiftright128_v8i16:
+; POWER10-BE:       # %bb.0: # %entry
+; POWER10-BE-NEXT:    mtvsrwz v3, r3
+; POWER10-BE-NEXT:    vspltb v3, v3, 7
+; POWER10-BE-NEXT:    vsrq v2, v2, v3
+; POWER10-BE-NEXT:    blr
+;
+; POWER3210-BE-LABEL: shiftright128_v8i16:
+; POWER3210-BE:       # %bb.0: # %entry
+; POWER3210-BE-NEXT:    mtvsrwz v3, r3
+; POWER3210-BE-NEXT:    vspltb v3, v3, 7
+; POWER3210-BE-NEXT:    vsrq v2, v2, v3
+; POWER3210-BE-NEXT:    blr
+;
+; POWER9-LE-LABEL: shiftright128_v8i16:
+; POWER9-LE:       # %bb.0: # %entry
+; POWER9-LE-NEXT:    mtvsrd v3, r5
+; POWER9-LE-NEXT:    vspltb v3, v3, 7
+; POWER9-LE-NEXT:    vsro v2, v2, v3
+; POWER9-LE-NEXT:    vsr v2, v2, v3
+; POWER9-LE-NEXT:    blr
+;
+; POWER9-BE-LABEL: shiftright128_v8i16:
+; POWER9-BE:       # %bb.0: # %entry
+; POWER9-BE-NEXT:    mtvsrwz v3, r3
+; POWER9-BE-NEXT:    vspltb v3, v3, 7
+; POWER9-BE-NEXT:    vsro v2, v2, v3
+; POWER9-BE-NEXT:    vsr v2, v2, v3
+; POWER9-BE-NEXT:    blr
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+  %0 = bitcast <8 x i16> %in to <4 x i32>
+  %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+  %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+  %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+  %4 = bitcast <4 x i32> %3 to <8 x i16>
+  ret <8 x i16> %4
+}
+
+; Test case with different vectors (should not optimize - different shift amounts)
+define <16 x i8> @no_optimization_different_shifts(<16 x i8> %in, i8 zeroext %sh1, i8 zeroext %sh2)  {
+; POWER10-LE-LABEL: no_optimization_different_shifts:
+; POWER10-LE:       # %bb.0: # %entry
+; POWER10-LE-NEXT:    mtvsrd v3, r5
+; POWER10-LE-NEXT:    mtvsrd v4, r6
+; POWER10-LE-NEXT:    vspltb v3, v3, 7
+; POWER10-LE-NEXT:    vspltb v4, v4, 7
+; POWER10-LE-NEXT:    vsro v2, v2, v3
+; POWER10-LE-NEXT:    vsr v2, v2, v4
+; POWER10-LE-NEXT:    blr
+;
+; POWER10-BE-LABEL: no_optimization_different_shifts:
+; POWER10-BE:       # %bb.0: # %entry
+; POWER10-BE-NEXT:    mtvsrwz v3, r3
+; POWER10-BE-NEXT:    mtvsrwz v4, r4
+; POWER10-BE-NEXT:    vspltb v3, v3, 7
+; POWER10-BE-NEXT:    vspltb v4, v4, 7
+; POWER10-BE-NEXT:    vsro v2, v2, v3
+; POWER10-BE-NEXT:    vsr v2, v2, v4
+; POWER10-BE-NEXT:    blr
+;
+; POWER3210-BE-LABEL: no_optimization_different_shifts:
+; POWER3210-BE:       # %bb.0: # %entry
+; POWER3210-BE-NEXT:    mtvsrwz v3, r3
+; POWER3210-BE-NEXT:    mtvsrwz v4, r4
+; POWER3210-BE-NEXT:    vspltb v3, v3, 7
+; POWER3210-BE-NEXT:    vspltb v4, v4, 7
+; POWER3210-BE-NEXT:    vsro v2, v2, v3
+; POWER3210-BE-NEXT:    vsr v2, v2, v4
+; POWER3210-BE-NEXT:    blr
+;
+; POWER9-LE-LABEL: no_optimization_different_shifts:
+; POWER9-LE:       # %bb.0: # %entry
+; POWER9-LE-NEXT:    mtvsrd v3, r5
+; POWER9-LE-NEXT:    mtvsrd v4, r6
+; POWER9-LE-NEXT:    vspltb v3, v3, 7
+; POWER9-LE-NEXT:    vspltb v4, v4, 7
+; POWER9-LE-NEXT:    vsro v2, v2, v3
+; POWER9-LE-NEXT:    vsr v2, v2, v4
+; POWER9-LE-NEXT:    blr
+;
+; POWER9-BE-LABEL: no_optimization_different_shifts:
+; POWER9-BE:       # %bb.0: # %entry
+; POWER9-BE-NEXT:    mtvsrwz v3, r3
+; POWER9-BE-NEXT:    mtvsrwz v4, r4
+; POWER9-BE-NEXT:    vspltb v3, v3, 7
+; POWER9-BE-NEXT:    vspltb v4, v4, 7
+; POWER9-BE-NEXT:    vsro v2, v2, v3
+; POWER9-BE-NEXT:    vsr v2, v2, v4
+; POWER9-BE-NEXT:    blr
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh1, i64 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+  %splat.splatinsert.i2 = insertelement <16 x i8> poison, i8 %sh2, i64 0
+  %splat.splat.i2 = shufflevector <16 x i8> %splat.splatinsert.i2, <16 x i8> poison, <16 x i32> zeroinitializer
+  %0 = bitcast <16 x i8> %in to <4 x i32>
+  %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+  %2 = bitcast <16 x i8> %splat.splat.i2 to <4 x i32>
+  %3 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+  %4 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %3, <4 x i32> %2)
+  %5 = bitcast <4 x i32> %4 to <16 x i8>
+  ret <16 x i8> %5
+}
+
+; Test case with multiple uses of VSRO result (should not optimize)
+define <16 x i8> @no_optimization_multiple_uses(<16 x i8> %in, i8 zeroext %sh)  {
+; POWER10-LE-LABEL: no_optimization_multiple_uses:
+; POWER10-LE:       # %bb.0: # %entry
+; POWER10-LE-NEXT:    mtvsrd v3, r5
+; POWER10-LE-NEXT:    vspltb v3, v3, 7
+; POWER10-LE-NEXT:    vsro v2, v2, v3
+; POWER10-LE-NEXT:    vsr v3, v2, v3
+; POWER10-LE-NEXT:    vaddubm v2, v2, v3
+; POWER10-LE-NEXT:    blr
+;
+; POWER10-BE-LABEL: no_optimization_multiple_uses:
+; POWER10-BE:       # %bb.0: # %entry
+; POWER10-BE-NEXT:    mtvsrwz v3, r3
+; POWER10-BE-NEXT:    vspltb v3, v3, 7
+; POWER10-BE-NEXT:    vsro v2, v2, v3
+; POWER10-BE-NEXT:    vsr v3, v2, v3
+; POWER10-BE-NEXT:    vaddubm v2, v2, v3
+; POWER10-BE-NEXT:    blr
+;
+; POWER3210-BE-LABEL: no_optimization_multiple_uses:
+; POWER3210-BE:       # %bb.0: # %entry
+; POWER3210-BE-NEXT:    mtvsrwz v3, r3
+; POWER3210-BE-NEXT:    vspltb v3, v3, 7
+; POWER3210-BE-NEXT:    vsro v2, v2, v3
+; POWER3210-BE-NEXT:    vsr v3, v2, v3
+; POWER3210-BE-NEXT:    vaddubm v2, v2, v3
+; POWER3210-BE-NEXT:    blr
+;
+; POWER9-LE-LABEL: no_optimization_multiple_uses:
+; POWER9-LE:       # %bb.0: # %entry
+; POWER9-LE-NEXT:    mtvsrd v3, r5
+; POWER9-LE-NEXT:    vspltb v3, v3, 7
+; POWER9-LE-NEXT:    vsro v2, v2, v3
+; POWER9-LE-NEXT:    vsr v3, v2, v3
+; POWER9-LE-NEXT:    vaddubm v2, v2, v3
+; POWER9-LE-NEXT:    blr
+;
+; POWER9-BE-LABEL: no_optimization_multiple_uses:
+; POWER9-BE:       # %bb.0: # %entry
+; POWER9-BE-NEXT:    mtvsrwz v3, r3
+; POWER9-BE-NEXT:    vspltb v3, v3, 7
+; POWER9-BE-NEXT:    vsro v2, v2, v3
+; POWER9-BE-NEXT:    vsr v3, v2, v3
+; POWER9-BE-NEXT:    vaddubm v2, v2, v3
+; POWER9-BE-NEXT:    blr
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+  %0 = bitcast <16 x i8> %in to <4 x i32>
+  %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+  %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+  %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+  %4 = bitcast <4 x i32> %3 to <16 x i8>
+  %5 = bitcast <4 x i32> %2 to <16 x i8>
+  %6 = add <16 x i8> %5, %4
+  ret <16 x i8> %6
+}

>From b0b23c53ffffc7e0febdece8d6bdfcb0220a28b3 Mon Sep 17 00:00:00 2001
From: Tony Varghese <tony.varghese at ibm.com>
Date: Wed, 20 Aug 2025 06:05:33 +0000
Subject: [PATCH 2/4] Fixed the comments for better understanding

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 41a9a5d827478..dde9acb33a6de 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -16682,16 +16682,19 @@ SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
   llvm_unreachable("Expected a load or store node here");
 }
 
-// Combine VSR(VSRO input, shift), shift) to VSRQ(input, shift)
+// Combine VSR(VSRO (input, vsro_byte_shift), vsr_bit_shift) to VSRQ(input,
+// vsrq_bit_shift) where vsrq_bit_shift = (vsro_byte_shift * 8) + vsr_bit_shift
 //
 // PowerPC Vector Shift Instructions:
 // - vsro (Vector Shift Right by Octet): Shifts vector right by N bytes,
 //   where N is specified in bits 121:124 of the shift vector (4 bits, 0-15
-//   bytes)
+//   bytes shift)
 // - vsr (Vector Shift Right): Shifts vector right by N bits,
-//   where N is specified in bits 125:127 of the shift vector (3 bits, 0-7 bits)
+//   where N is specified in bits 125:127 of the shift vector (3 bits, 0-7 bits
+//   shift)
 // - vsrq (Vector Shift Right Quadword): Shifts vector right by N bits,
-//   where N is specified in bits 57:63 of the shift vector (7 bits, 0-127 bits)
+//   where N is specified in low-order 7 bits of the shift vector (7 bits, 0-127
+//   bits shift)
 //
 // Input DAG pattern: vsr(vsro(input, shift_vector), shift_vector)
 // performs the following shifts:
@@ -16701,11 +16704,11 @@ SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
 //   Total shift = (bits[121:124] * 8) + bits[125:127] bits
 //
 // Since bits 121:127 form a 7-bit value representing the total shift amount,
-// and vsrq uses the same 7-bit shift amount (assuming bits 57:63 map to
-// 121:127), we can replace the two-instruction sequence with a single vsrq
-// instruction.
+// and vsrq uses the same 7-bit shift amount, replace the two-instruction
+// sequence with a single vsrq instruction.
 //
-// Optimization: vsr(vsro(input, shift), shift) -> vsrq(input, shift)
+// Input DAG      : vsr(vsro(input, vsro_byte_shift), vsr_bit_shift)
+// Optimized DAG  : vsrq(input, vsrq_bit_shift)
 SDValue PPCTargetLowering::combineVSROVSRToVSRQ(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
 

>From 8571e675d88a4946bb18ea0760eb6e914b7a0bcf Mon Sep 17 00:00:00 2001
From: Tony Varghese <tony.varghese at ibm.com>
Date: Wed, 20 Aug 2025 09:25:39 -0500
Subject: [PATCH 3/4] Using PatFrag to simplify the pattern matching

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   | 68 -------------------
 llvm/lib/Target/PowerPC/PPCInstrAltivec.td    |  7 ++
 llvm/lib/Target/PowerPC/PPCInstrP10.td        |  3 +
 .../PowerPC/vsro-vsr-vsrq-dag-combine.ll      |  2 +-
 4 files changed, 11 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index dde9acb33a6de..595ef9eb9b98f 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -16682,70 +16682,6 @@ SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
   llvm_unreachable("Expected a load or store node here");
 }
 
-// Combine VSR(VSRO (input, vsro_byte_shift), vsr_bit_shift) to VSRQ(input,
-// vsrq_bit_shift) where vsrq_bit_shift = (vsro_byte_shift * 8) + vsr_bit_shift
-//
-// PowerPC Vector Shift Instructions:
-// - vsro (Vector Shift Right by Octet): Shifts vector right by N bytes,
-//   where N is specified in bits 121:124 of the shift vector (4 bits, 0-15
-//   bytes shift)
-// - vsr (Vector Shift Right): Shifts vector right by N bits,
-//   where N is specified in bits 125:127 of the shift vector (3 bits, 0-7 bits
-//   shift)
-// - vsrq (Vector Shift Right Quadword): Shifts vector right by N bits,
-//   where N is specified in low-order 7 bits of the shift vector (7 bits, 0-127
-//   bits shift)
-//
-// Input DAG pattern: vsr(vsro(input, shift_vector), shift_vector)
-// performs the following shifts:
-//   1. vsro: input >> (bits[121:124] * 8) bits   [byte shifts converted to
-//   bits]
-//   2. vsr:  result >> bits[125:127] bits        [additional bit shifts]
-//   Total shift = (bits[121:124] * 8) + bits[125:127] bits
-//
-// Since bits 121:127 form a 7-bit value representing the total shift amount,
-// and vsrq uses the same 7-bit shift amount, replace the two-instruction
-// sequence with a single vsrq instruction.
-//
-// Input DAG      : vsr(vsro(input, vsro_byte_shift), vsr_bit_shift)
-// Optimized DAG  : vsrq(input, vsrq_bit_shift)
-SDValue PPCTargetLowering::combineVSROVSRToVSRQ(SDNode *N,
-                                                DAGCombinerInfo &DCI) const {
-
-  // Only available on ISA 3.1+ (Power10+)
-  if (!Subtarget.isISA3_1())
-    return SDValue();
-
-  SelectionDAG &DAG = DCI.DAG;
-  SDValue VSRInput = N->getOperand(1);
-  SDValue VSRShift = N->getOperand(2);
-
-  // Check if VSR input comes from a VSRO intrinsic
-  if (VSRInput.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
-    return SDValue();
-
-  unsigned VSROIntrinsicID = VSRInput->getConstantOperandVal(0);
-  if (VSROIntrinsicID != Intrinsic::ppc_altivec_vsro)
-    return SDValue();
-
-  // Check if VSRO uses the same shift amount register as VSR
-  SDValue VSROShift = VSRInput.getOperand(2);
-  if (VSRShift != VSROShift)
-    return SDValue();
-
-  // Check single use - VSRO result should only be used by this VSR
-  if (!VSRInput.hasOneUse())
-    return SDValue();
-
-  // Get the original input to VSRO instruction
-  SDValue VSROOrigInput = VSRInput.getOperand(1);
-
-  return DAG.getNode(PPCISD::VSRQ, SDLoc(N),
-                     N->getValueType(0), // Preserve original result type
-                     VSROOrigInput,      // Original input vector
-                     VSRShift);          // Shift amount
-}
-
 static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
   unsigned IntrinsicID = Intrin.getConstantOperandVal(1);
   if (IntrinsicID == Intrinsic::ppc_stdcx)
@@ -17273,10 +17209,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
           }
         }
       }
-
-      // combine VSRO + VSR intrinsic calls to optimize with VSRQ
-      if (IID == Intrinsic::ppc_altivec_vsr)
-        return combineVSROVSRToVSRQ(N, DCI);
     }
 
     break;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index 79fe12e8e4b49..04efd99b5472c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -261,6 +261,13 @@ def immEQOneV : PatLeaf<(build_vector), [{
     return C->isOne();
   return false;
 }]>;
+
+def VSRVSRO : PatFrag<(ops node:$input, node:$shift), 
+                      (int_ppc_altivec_vsr 
+                        (int_ppc_altivec_vsro node:$input, node:$shift), 
+                        node:$shift), 
+                      [{ return N->getOperand(1).hasOneUse(); }]>;
+
 //===----------------------------------------------------------------------===//
 // Helpers for defining instructions that directly correspond to intrinsics.
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index 902c40544ac28..d65aab9b654f8 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -2055,6 +2055,9 @@ let Predicates = [IsISA3_1, HasFPU] in {
 
 //---------------------------- Anonymous Patterns ----------------------------//
 let Predicates = [IsISA3_1] in {
+  // Exploit the vsrq instruction: when vsro and vsr share one shift vector,
+  // select VSR(VSRO(input, shift), shift) as a single VSRQ(input, shift).
+  def : Pat<(VSRVSRO v4i32:$vA, v4i32:$vB), (VSRQ $vA, $vB)>;
   // Exploit the vector multiply high instructions using intrinsics.
   def : Pat<(v4i32 (int_ppc_altivec_vmulhsw v4i32:$vA, v4i32:$vB)),
             (v4i32 (VMULHSW $vA, $vB))>;
diff --git a/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll b/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll
index c2599c8f6af13..afbdae6dfa09a 100644
--- a/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll
+++ b/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll
@@ -212,7 +212,7 @@ entry:
   ret <8 x i16> %4
 }
 
-; Test case with different vectors (should not optimize - different shift amounts)
+; Test case with different vectors (should not optimize - different shift amount registers)
 define <16 x i8> @no_optimization_different_shifts(<16 x i8> %in, i8 zeroext %sh1, i8 zeroext %sh2)  {
 ; POWER10-LE-LABEL: no_optimization_different_shifts:
 ; POWER10-LE:       # %bb.0: # %entry

>From b737004e1d7268144239e779da9d93d362b80469 Mon Sep 17 00:00:00 2001
From: Tony Varghese <tony.varghese at ibm.com>
Date: Wed, 20 Aug 2025 09:25:39 -0500
Subject: [PATCH 4/4] Using PatFrag to simplify the pattern matching

---
 llvm/lib/Target/PowerPC/PPCISelLowering.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 362ccdfb26efa..6009d99a50520 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1450,7 +1450,6 @@ namespace llvm {
                                  SelectionDAG &DAG) const;
     SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
                                  DAGCombinerInfo &DCI) const;
-    SDValue combineVSROVSRToVSRQ(SDNode *N, DAGCombinerInfo &DCI) const;
 
     /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
     /// SETCC with integer subtraction when (1) there is a legal way of doing it