[llvm] 60ee515 - [PowerPC] Emit lxvkq and vsrq instructions for build vector patterns (#157625)

Tue Oct 14 22:24:08 PDT 2025

Author: Tony Varghese
Date: 2025-10-15T10:54:04+05:30
New Revision: 60ee515b8cc2f14d548d7e0a20f44434a237f22b

URL: https://github.com/llvm/llvm-project/commit/60ee515b8cc2f14d548d7e0a20f44434a237f22b
DIFF: https://github.com/llvm/llvm-project/commit/60ee515b8cc2f14d548d7e0a20f44434a237f22b.diff

LOG: [PowerPC] Emit lxvkq and vsrq instructions for build vector patterns (#157625)

### Optimize BUILD_VECTOR having special quadword patterns

This change optimizes `BUILD_VECTOR` operations by using the `lxvkq` or
`xxpltib + vsrq` instructions to inline constants matching specific
128-bit patterns:
- **MSB set pattern**: `0x8000_0000_0000_0000_0000_0000_0000_0000`
- **LSB set pattern**: `0x0000_0000_0000_0000_0000_0000_0000_0001`

### Implementation Details

The `lxvkq` instruction loads special quadword values into VSX
registers:
```asm
lxvkq XT, UIM
# When UIM=16: loads 0x8000_0000_0000_0000_0000_0000_0000_0000
```

The optimization reconstructs the 128-bit register pattern from
`BUILD_VECTOR` operands, accounting for target endianness. For example,
the MSB pattern can be represented as:
- **Big-Endian**: `<i64 -9223372036854775808, i64 0>`
- **Little-Endian**: `<i64 0, i64 -9223372036854775808>`

Both produce the same register value:
`0x8000_0000_0000_0000_0000_0000_0000_0000`

### MSB Pattern (`0x8000...0000`)
All vector types (`v2i64`, `v4i32`, `v8i16`, `v16i8`) generate:
```asm
lxvkq v2, 16
```

### LSB Pattern (`0x0000...0001`)
All vector types generate:
```asm
xxspltib v2, 255
vsrq v2, v2, v2
```

---------

Co-authored-by: Tony Varghese <tony.varghese at ibm.com>

Added: 
    llvm/test/CodeGen/PowerPC/lxvkq-vec-constant.ll

Modified: 
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.h
    llvm/test/CodeGen/PowerPC/vector-reduce-add.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 944a1e2e6fa17..8bf0d118da575 100644

--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9702,6 +9702,10 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
       }
       return SDV;
     }
+    // Recognize build vector patterns to emit VSX vector instructions
+    // instead of loading value from memory.
+    if (SDValue VecPat = combineBVLoadsSpecialValue(Op, DAG))
+      return VecPat;
   }
   // Check if this is a splat of a constant value.
   APInt APSplatBits, APSplatUndef;
@@ -15696,6 +15700,142 @@ combineElementTruncationToVectorTruncation(SDNode *N,
   return SDValue();
 }
 
+// LXVKQ instruction load VSX vector with a special quadword value
+// based on an immediate value. This helper method returns the details of the
+// match as a tuple of {LXVKQ unsigned IMM Value, right_shift_amount}
+// to help generate the LXVKQ instruction and the subsequent shift instruction
+// required to match the original build vector pattern.
+
+// LXVKQPattern: {LXVKQ unsigned IMM Value, right_shift_amount}
+using LXVKQPattern = std::tuple<uint32_t, uint8_t>;
+
+static std::optional<LXVKQPattern> getPatternInfo(const APInt &FullVal) {
+
+  // LXVKQ instruction loads the Quadword value:
+  // 0x8000_0000_0000_0000_0000_0000_0000_0000 when imm = 0b10000
+  static const APInt BasePattern = APInt(128, 0x8000000000000000ULL) << 64;
+  static const uint32_t Uim = 16;
+
+  // Check for direct LXVKQ match (no shift needed)
+  if (FullVal == BasePattern)
+    return std::make_tuple(Uim, uint8_t{0});
+
+  // Check if FullValue is 1 (the result of the base pattern >> 127)
+  if (FullVal == APInt(128, 1))
+    return std::make_tuple(Uim, uint8_t{127});
+
+  return std::nullopt;
+}
+
+/// Combine vector loads to a single load (using lxvkq) or splat with shift of a
+/// constant (xxspltib + vsrq) by recognising patterns in the Build Vector.
+/// LXVKQ instruction load VSX vector with a special quadword value based on an
+/// immediate value. if UIM=0b10000 then LXVKQ loads VSR[32×TX+T] with value
+/// 0x8000_0000_0000_0000_0000_0000_0000_0000.
+/// This can be used to inline the build vector constants that have the
+/// following patterns:
+///
+/// 0x8000_0000_0000_0000_0000_0000_0000_0000 (MSB set pattern)
+/// 0x0000_0000_0000_0000_0000_0000_0000_0001 (LSB set pattern)
+/// MSB pattern can directly loaded using LXVKQ while LSB is loaded using a
+/// combination of splatting and right shift instructions.
+
+SDValue PPCTargetLowering::combineBVLoadsSpecialValue(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+
+  assert((Op.getNode() && Op.getOpcode() == ISD::BUILD_VECTOR) &&
+         "Expected a BuildVectorSDNode in combineBVLoadsSpecialValue");
+
+  // This transformation is only supported if we are loading either a byte,
+  // halfword, word, or doubleword.
+  EVT VT = Op.getValueType();
+  if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
+        VT == MVT::v2i64))
+    return SDValue();
+
+  LLVM_DEBUG(llvm::dbgs() << "\ncombineBVLoadsSpecialValue: Build vector ("
+                          << VT.getEVTString() << "): ";
+             Op->dump());
+
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned ElemBits = VT.getScalarSizeInBits();
+
+  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
+
+  // Check for Non-constant operand in the build vector.
+  for (const SDValue &Operand : Op.getNode()->op_values()) {
+    if (!isa<ConstantSDNode>(Operand))
+      return SDValue();
+  }
+
+  // Assemble build vector operands as a 128-bit register value
+  // We need to reconstruct what the 128-bit register pattern would be
+  // that produces this vector when interpreted with the current endianness
+  APInt FullVal = APInt::getZero(128);
+
+  for (unsigned Index = 0; Index < NumElems; ++Index) {
+    auto *C = cast<ConstantSDNode>(Op.getOperand(Index));
+
+    // Get element value as raw bits (zero-extended)
+    uint64_t ElemValue = C->getZExtValue();
+
+    // Mask to element size to ensure we only get the relevant bits
+    if (ElemBits < 64)
+      ElemValue &= ((1ULL << ElemBits) - 1);
+
+    // Calculate bit position for this element in the 128-bit register
+    unsigned BitPos =
+        (IsLittleEndian) ? (Index * ElemBits) : (128 - (Index + 1) * ElemBits);
+
+    // Create APInt for the element value and shift it to correct position
+    APInt ElemAPInt(128, ElemValue);
+    ElemAPInt <<= BitPos;
+
+    // Place the element value at the correct bit position
+    FullVal |= ElemAPInt;
+  }
+
+  if (FullVal.isZero() || FullVal.isAllOnes())
+    return SDValue();
+
+  if (auto UIMOpt = getPatternInfo(FullVal)) {
+    const auto &[Uim, ShiftAmount] = *UIMOpt;
+    SDLoc Dl(Op);
+
+    // Generate LXVKQ instruction if the shift amount is zero.
+    if (ShiftAmount == 0) {
+      SDValue UimVal = DAG.getTargetConstant(Uim, Dl, MVT::i32);
+      SDValue LxvkqInstr =
+          SDValue(DAG.getMachineNode(PPC::LXVKQ, Dl, VT, UimVal), 0);
+      LLVM_DEBUG(llvm::dbgs()
+                     << "combineBVLoadsSpecialValue: Instruction Emitted ";
+                 LxvkqInstr.dump());
+      return LxvkqInstr;
+    }
+
+    assert(ShiftAmount == 127 && "Unexpected lxvkq shift amount value");
+
+    // The right shifted pattern can be constructed using a combination of
+    // XXSPLTIB and VSRQ instruction. VSRQ uses the shift amount from the lower
+    // 7 bits of byte 15. This can be specified using XXSPLTIB with immediate
+    // value 255.
+    SDValue ShiftAmountVec =
+        SDValue(DAG.getMachineNode(PPC::XXSPLTIB, Dl, MVT::v4i32,
+                                   DAG.getTargetConstant(255, Dl, MVT::i32)),
+                0);
+    // Generate appropriate right shift instruction
+    SDValue ShiftVec = SDValue(
+        DAG.getMachineNode(PPC::VSRQ, Dl, VT, ShiftAmountVec, ShiftAmountVec),
+        0);
+    LLVM_DEBUG(llvm::dbgs()
+                   << "\n combineBVLoadsSpecialValue: Instruction Emitted ";
+               ShiftVec.dump());
+    return ShiftVec;
+  }
+  // No patterns matched for build vectors.
+  return SDValue();
+}
+
 /// Reduce the number of loads when building a vector.
 ///
 /// Building a vector out of multiple loads can be converted to a load

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 59f338782ba43..880aca751d7d6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1472,6 +1472,9 @@ namespace llvm {
     combineElementTruncationToVectorTruncation(SDNode *N,
                                                DAGCombinerInfo &DCI) const;
 
+    SDValue combineBVLoadsSpecialValue(SDValue Operand,
+                                       SelectionDAG &DAG) const;
+
     /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be
     /// handled by the VINSERTH instruction introduced in ISA 3.0. This is
     /// essentially any shuffle of v8i16 vectors that just inserts one element

diff  --git a/llvm/test/CodeGen/PowerPC/lxvkq-vec-constant.ll b/llvm/test/CodeGen/PowerPC/lxvkq-vec-constant.ll
new file mode 100644
index 0000000000000..0ee4524a6c68a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/lxvkq-vec-constant.ll
@@ -0,0 +1,307 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC64-LE-10
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-unknown-unknown \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC64-BE-10
+
+; Test LXVKQ instruction generation for special vector constants matching 128 bit patterns:
+; 0x8000_0000_0000_0000_0000_0000_0000_0000 (MSB set pattern)
+; 0x0000_0000_0000_0000_0000_0000_0000_0001 (LSB set pattern)
+
+; =============================================================================
+; v2i64 tests - MSB set pattern (0x8000_0000_0000_0000_0000_0000_0000_0000)
+; =============================================================================
+
+; Big-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <-9223372036854775808, 0>
+define dso_local noundef <2 x i64> @test_v2i64_msb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v2i64_msb_set_bigendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    plxv v2, .LCPI0_0 at PCREL(0), 1
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v2i64_msb_set_bigendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    lxvkq v2, 16
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <2 x i64> <i64 -9223372036854775808, i64 0>
+}
+
+; Little-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <0, -9223372036854775808>
+define dso_local noundef <2 x i64> @test_v2i64_msb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v2i64_msb_set_littleendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    lxvkq v2, 16
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v2i64_msb_set_littleendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    addis r3, r2, .LCPI1_0 at toc@ha
+; POWERPC64-BE-10-NEXT:    addi r3, r3, .LCPI1_0 at toc@l
+; POWERPC64-BE-10-NEXT:    lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <2 x i64> <i64 0, i64 -9223372036854775808>
+}
+
+; =============================================================================
+; v4i32 tests - MSB set pattern (0x8000_0000_0000_0000_0000_0000_0000_0000)
+; =============================================================================
+
+; Big-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <-2147483648, 0, 0, 0>
+define dso_local noundef <4 x i32> @test_v4i32_msb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v4i32_msb_set_bigendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    plxv v2, .LCPI2_0 at PCREL(0), 1
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v4i32_msb_set_bigendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    lxvkq v2, 16
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <4 x i32> <i32 -2147483648, i32 0, i32 0, i32 0>
+}
+
+; Little-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <0, 0, 0, -2147483648>
+define dso_local noundef <4 x i32> @test_v4i32_msb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v4i32_msb_set_littleendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    lxvkq v2, 16
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v4i32_msb_set_littleendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    addis r3, r2, .LCPI3_0 at toc@ha
+; POWERPC64-BE-10-NEXT:    addi r3, r3, .LCPI3_0 at toc@l
+; POWERPC64-BE-10-NEXT:    lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <4 x i32> <i32 0, i32 0, i32 0, i32 -2147483648>
+}
+
+; =============================================================================
+; v8i16 tests - MSB set pattern (0x8000_0000_0000_0000_0000_0000_0000_0000)
+; =============================================================================
+
+; Big-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <-32768, 0, 0, 0, 0, 0, 0, 0>
+define dso_local noundef <8 x i16> @test_v8i16_msb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v8i16_msb_set_bigendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    plxv v2, .LCPI4_0 at PCREL(0), 1
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v8i16_msb_set_bigendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    lxvkq v2, 16
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <8 x i16> <i16 -32768, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+}
+
+; Little-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <0, 0, 0, 0, 0, 0, 0, -32768>
+define dso_local noundef <8 x i16> @test_v8i16_msb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v8i16_msb_set_littleendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    lxvkq v2, 16
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v8i16_msb_set_littleendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    addis r3, r2, .LCPI5_0 at toc@ha
+; POWERPC64-BE-10-NEXT:    addi r3, r3, .LCPI5_0 at toc@l
+; POWERPC64-BE-10-NEXT:    lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 -32768>
+}
+
+; =============================================================================
+; v16i8 tests - MSB set pattern (0x8000_0000_0000_0000_0000_0000_0000_0000)
+; =============================================================================
+
+; Big-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <-128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>
+define dso_local noundef <16 x i8> @test_v16i8_msb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v16i8_msb_set_bigendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    plxv v2, .LCPI6_0 at PCREL(0), 1
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v16i8_msb_set_bigendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    lxvkq v2, 16
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 -128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+; Little-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128>
+define dso_local noundef <16 x i8> @test_v16i8_msb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v16i8_msb_set_littleendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    lxvkq v2, 16
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v16i8_msb_set_littleendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    addis r3, r2, .LCPI7_0 at toc@ha
+; POWERPC64-BE-10-NEXT:    addi r3, r3, .LCPI7_0 at toc@l
+; POWERPC64-BE-10-NEXT:    lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -128>
+}
+
+; =============================================================================
+; v2i64 tests - LSB set pattern (0x0000_0000_0000_0000_0000_0000_0000_0001)
+; =============================================================================
+
+; Big-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <0, 1>
+define dso_local noundef <2 x i64> @test_v2i64_lsb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v2i64_lsb_set_bigendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    plxv v2, .LCPI8_0 at PCREL(0), 1
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v2i64_lsb_set_bigendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    xxspltib v2, 255
+; POWERPC64-BE-10-NEXT:    vsrq v2, v2, v2
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <2 x i64> <i64 0, i64 1>
+}
+
+; Little-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <1, 0>
+define dso_local noundef <2 x i64> @test_v2i64_lsb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v2i64_lsb_set_littleendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    xxspltib v2, 255
+; POWERPC64-LE-10-NEXT:    vsrq v2, v2, v2
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v2i64_lsb_set_littleendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    addis r3, r2, .LCPI9_0 at toc@ha
+; POWERPC64-BE-10-NEXT:    addi r3, r3, .LCPI9_0 at toc@l
+; POWERPC64-BE-10-NEXT:    lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <2 x i64> <i64 1, i64 0>
+}
+
+; =============================================================================
+; v4i32 tests - LSB set pattern (0x0000_0000_0000_0000_0000_0000_0000_0001)
+; =============================================================================
+
+; Big-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <0, 0, 0, 1>
+define dso_local noundef <4 x i32> @test_v4i32_lsb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v4i32_lsb_set_bigendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    plxv v2, .LCPI10_0 at PCREL(0), 1
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v4i32_lsb_set_bigendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    xxspltib v2, 255
+; POWERPC64-BE-10-NEXT:    vsrq v2, v2, v2
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+}
+
+; Little-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <1, 0, 0, 0>
+define dso_local noundef <4 x i32> @test_v4i32_lsb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v4i32_lsb_set_littleendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    xxspltib v2, 255
+; POWERPC64-LE-10-NEXT:    vsrq v2, v2, v2
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v4i32_lsb_set_littleendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    addis r3, r2, .LCPI11_0 at toc@ha
+; POWERPC64-BE-10-NEXT:    addi r3, r3, .LCPI11_0 at toc@l
+; POWERPC64-BE-10-NEXT:    lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+}
+
+; =============================================================================
+; v8i16 tests - LSB set pattern (0x0000_0000_0000_0000_0000_0000_0000_0001)
+; =============================================================================
+
+; Big-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <0, 0, 0, 0, 0, 0, 0, 1>
+define dso_local noundef <8 x i16> @test_v8i16_lsb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v8i16_lsb_set_bigendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    plxv v2, .LCPI12_0 at PCREL(0), 1
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v8i16_lsb_set_bigendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    xxspltib v2, 255
+; POWERPC64-BE-10-NEXT:    vsrq v2, v2, v2
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 1>
+}
+
+; Little-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <1, 0, 0, 0, 0, 0, 0, 0>
+define dso_local noundef <8 x i16> @test_v8i16_lsb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v8i16_lsb_set_littleendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    xxspltib v2, 255
+; POWERPC64-LE-10-NEXT:    vsrq v2, v2, v2
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v8i16_lsb_set_littleendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    addis r3, r2, .LCPI13_0 at toc@ha
+; POWERPC64-BE-10-NEXT:    addi r3, r3, .LCPI13_0 at toc@l
+; POWERPC64-BE-10-NEXT:    lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <8 x i16> <i16 1, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+}
+
+; =============================================================================
+; v16i8 tests - LSB set pattern (0x0000_0000_0000_0000_0000_0000_0000_0001)
+; =============================================================================
+
+; Big-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>
+define dso_local noundef <16 x i8> @test_v16i8_lsb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v16i8_lsb_set_bigendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    plxv v2, .LCPI14_0 at PCREL(0), 1
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v16i8_lsb_set_bigendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    xxspltib v2, 255
+; POWERPC64-BE-10-NEXT:    vsrq v2, v2, v2
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1>
+}
+
+; Little-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>
+define dso_local noundef <16 x i8> @test_v16i8_lsb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v16i8_lsb_set_littleendian:
+; POWERPC64-LE-10:       # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT:    xxspltib v2, 255
+; POWERPC64-LE-10-NEXT:    vsrq v2, v2, v2
+; POWERPC64-LE-10-NEXT:    blr
+;
+; POWERPC64-BE-10-LABEL: test_v16i8_lsb_set_littleendian:
+; POWERPC64-BE-10:       # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT:    addis r3, r2, .LCPI15_0 at toc@ha
+; POWERPC64-BE-10-NEXT:    addi r3, r3, .LCPI15_0 at toc@l
+; POWERPC64-BE-10-NEXT:    lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
\ No newline at end of file

diff  --git a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
index 0892210fc7443..d506d2062be5c 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
@@ -1566,12 +1566,16 @@ define dso_local i64 @v16i8tov16i64_sign(<16 x i8> %a) local_unnamed_addr #0 {
 ; PWR10BE-LABEL: v16i8tov16i64_sign:
 ; PWR10BE:       # %bb.0: # %entry
 ; PWR10BE-NEXT:    addis r3, r2, .LCPI23_0 at toc@ha
+; PWR10BE-NEXT:    xxspltib v1, 255
 ; PWR10BE-NEXT:    addi r3, r3, .LCPI23_0 at toc@l
+; PWR10BE-NEXT:    vsrq v1, v1, v1
 ; PWR10BE-NEXT:    lxv v3, 0(r3)
 ; PWR10BE-NEXT:    addis r3, r2, .LCPI23_1 at toc@ha
 ; PWR10BE-NEXT:    addi r3, r3, .LCPI23_1 at toc@l
+; PWR10BE-NEXT:    vperm v1, v2, v2, v1
 ; PWR10BE-NEXT:    lxv v4, 0(r3)
 ; PWR10BE-NEXT:    addis r3, r2, .LCPI23_2 at toc@ha
+; PWR10BE-NEXT:    vextsb2d v1, v1
 ; PWR10BE-NEXT:    vperm v3, v2, v2, v3
 ; PWR10BE-NEXT:    addi r3, r3, .LCPI23_2 at toc@l
 ; PWR10BE-NEXT:    vextsb2d v3, v3
@@ -1585,23 +1589,18 @@ define dso_local i64 @v16i8tov16i64_sign(<16 x i8> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    vperm v5, v2, v2, v5
 ; PWR10BE-NEXT:    addi r3, r3, .LCPI23_4 at toc@l
 ; PWR10BE-NEXT:    vextsb2d v5, v5
-; PWR10BE-NEXT:    lxv v1, 0(r3)
+; PWR10BE-NEXT:    lxv v6, 0(r3)
 ; PWR10BE-NEXT:    addis r3, r2, .LCPI23_5 at toc@ha
 ; PWR10BE-NEXT:    vperm v0, v2, v2, v0
 ; PWR10BE-NEXT:    addi r3, r3, .LCPI23_5 at toc@l
 ; PWR10BE-NEXT:    vextsb2d v0, v0
-; PWR10BE-NEXT:    lxv v6, 0(r3)
+; PWR10BE-NEXT:    lxv v7, 0(r3)
 ; PWR10BE-NEXT:    addis r3, r2, .LCPI23_6 at toc@ha
-; PWR10BE-NEXT:    vperm v1, v2, v2, v1
+; PWR10BE-NEXT:    vperm v6, v2, v2, v6
 ; PWR10BE-NEXT:    vaddudm v5, v0, v5
 ; PWR10BE-NEXT:    vaddudm v3, v4, v3
 ; PWR10BE-NEXT:    vaddudm v3, v3, v5
 ; PWR10BE-NEXT:    addi r3, r3, .LCPI23_6 at toc@l
-; PWR10BE-NEXT:    vextsb2d v1, v1
-; PWR10BE-NEXT:    lxv v7, 0(r3)
-; PWR10BE-NEXT:    addis r3, r2, .LCPI23_7 at toc@ha
-; PWR10BE-NEXT:    vperm v6, v2, v2, v6
-; PWR10BE-NEXT:    addi r3, r3, .LCPI23_7 at toc@l
 ; PWR10BE-NEXT:    vextsb2d v6, v6
 ; PWR10BE-NEXT:    lxv v8, 0(r3)
 ; PWR10BE-NEXT:    vperm v7, v2, v2, v7
@@ -1609,7 +1608,7 @@ define dso_local i64 @v16i8tov16i64_sign(<16 x i8> %a) local_unnamed_addr #0 {
 ; PWR10BE-NEXT:    vperm v2, v2, v2, v8
 ; PWR10BE-NEXT:    vextsb2d v2, v2
 ; PWR10BE-NEXT:    vaddudm v2, v2, v7
-; PWR10BE-NEXT:    vaddudm v4, v6, v1
+; PWR10BE-NEXT:    vaddudm v4, v1, v6
 ; PWR10BE-NEXT:    vaddudm v2, v4, v2
 ; PWR10BE-NEXT:    vaddudm v2, v2, v3
 ; PWR10BE-NEXT:    xxswapd v3, v2