[llvm] [PowerPC] Optimize bitcast(truncate) patterns using vbpermq/vbpermd (PR #181233)
Maryam Moghadas via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 12:03:32 PDT 2026
https://github.com/maryammo updated https://github.com/llvm/llvm-project/pull/181233
>From 3250806e123b959caf2a7b02b7b26b75ca747495 Mon Sep 17 00:00:00 2001
From: Maryam Moghadas <maryammo at ca.ibm.com>
Date: Wed, 11 Feb 2026 17:41:44 +0000
Subject: [PATCH 1/6] [PowerPC] Implement v256i1 BUILD_VECTOR lowering
Pack 256 i1 operands into four i64, then construct two v2i64 vectors,
and combine them using PPCISD::PAIR_BUILD for v256i1.
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 35 +++++
.../PowerPC/mma-build-vector-v256i1.ll | 135 ++++++++++++++++++
2 files changed, 170 insertions(+)
create mode 100644 llvm/test/CodeGen/PowerPC/mma-build-vector-v256i1.ll
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 753306a8d365b..fd39c71247af8 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1382,6 +1382,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass);
setOperationAction(ISD::LOAD, MVT::v256i1, Custom);
setOperationAction(ISD::STORE, MVT::v256i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v256i1, Custom);
}
if (Subtarget.hasMMA()) {
if (Subtarget.isISAFuture()) {
@@ -9546,6 +9547,40 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+ EVT VT1 = Op.getValueType();
+ if (VT1 == MVT::v256i1) {
+ assert(Subtarget.pairedVectorMemops() &&
+ "v256i1 requires paired vector support");
+
+ // Group 256 individual i1 bits into 4 i64 scalars
+ SmallVector<SDValue, 4> Vals;
+
+ for (unsigned i = 0; i < 4; ++i) {
+ SDValue Val = DAG.getConstant(0, dl, MVT::i64);
+ for (unsigned j = 0; j < 64; ++j) {
+ SDValue Elt = Op.getOperand(i * 64 + j);
+ if (Elt.isUndef())
+ continue;
+ SDValue Bit = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Elt);
+ if (j > 0)
+ Bit = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit,
+ DAG.getConstant(j, dl, MVT::i64));
+ Val = DAG.getNode(ISD::OR, dl, MVT::i64, Val, Bit);
+ }
+ Vals.push_back(Val);
+ }
+
+ SDValue Low = DAG.getBuildVector(MVT::v2i64, dl, {Vals[0], Vals[1]});
+ SDValue High = DAG.getBuildVector(MVT::v2i64, dl, {Vals[2], Vals[3]});
+
+ Low = DAG.getBitcast(MVT::v4i32, Low);
+ High = DAG.getBitcast(MVT::v4i32, High);
+
+ return DAG.getNode(PPCISD::PAIR_BUILD, dl, MVT::v256i1,
+ Subtarget.isLittleEndian() ? High : Low,
+ Subtarget.isLittleEndian() ? Low : High);
+ }
+
if (Subtarget.hasP10Vector()) {
APInt BitMask(32, 0);
// If the value of the vector is all zeros or all ones,
diff --git a/llvm/test/CodeGen/PowerPC/mma-build-vector-v256i1.ll b/llvm/test/CodeGen/PowerPC/mma-build-vector-v256i1.ll
new file mode 100644
index 0000000000000..fd6a5d93784ea
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/mma-build-vector-v256i1.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=powerpc64le -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-LE
+; RUN: llc -mtriple=powerpc64 -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-BE
+
+define fastcc <16 x i16> @test(<16 x i1> %0) {
+; CHECK-LE-LABEL: test:
+; CHECK-LE: # %bb.0: # %Entry
+; CHECK-LE-NEXT: clrldi 0, 1, 59
+; CHECK-LE-NEXT: std 30, -16(1)
+; CHECK-LE-NEXT: mr 30, 1
+; CHECK-LE-NEXT: subfic 0, 0, -96
+; CHECK-LE-NEXT: stdux 1, 1, 0
+; CHECK-LE-NEXT: .cfi_def_cfa_register r30
+; CHECK-LE-NEXT: .cfi_offset r30, -16
+; CHECK-LE-NEXT: li 3, 1
+; CHECK-LE-NEXT: xxlxor 35, 35, 35
+; CHECK-LE-NEXT: vextubrx 4, 3, 2
+; CHECK-LE-NEXT: li 3, 0
+; CHECK-LE-NEXT: vextubrx 3, 3, 2
+; CHECK-LE-NEXT: clrldi 3, 3, 63
+; CHECK-LE-NEXT: rlwimi 3, 4, 1, 30, 30
+; CHECK-LE-NEXT: li 4, 2
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 2, 29, 29
+; CHECK-LE-NEXT: li 4, 3
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 3, 28, 28
+; CHECK-LE-NEXT: li 4, 4
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 4, 27, 27
+; CHECK-LE-NEXT: li 4, 5
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 5, 26, 26
+; CHECK-LE-NEXT: li 4, 6
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 6, 25, 25
+; CHECK-LE-NEXT: li 4, 7
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 7, 24, 24
+; CHECK-LE-NEXT: li 4, 8
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 8, 23, 23
+; CHECK-LE-NEXT: li 4, 9
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 9, 22, 22
+; CHECK-LE-NEXT: li 4, 10
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 10, 21, 21
+; CHECK-LE-NEXT: li 4, 11
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 11, 20, 20
+; CHECK-LE-NEXT: li 4, 12
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 12, 19, 19
+; CHECK-LE-NEXT: li 4, 13
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 13, 18, 18
+; CHECK-LE-NEXT: li 4, 14
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 14, 17, 17
+; CHECK-LE-NEXT: li 4, 15
+; CHECK-LE-NEXT: vextubrx 4, 4, 2
+; CHECK-LE-NEXT: rlwimi 3, 4, 15, 16, 16
+; CHECK-LE-NEXT: mtvsrdd 34, 0, 3
+; CHECK-LE-NEXT: mr 1, 30
+; CHECK-LE-NEXT: ld 30, -16(1)
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test:
+; CHECK-BE: # %bb.0: # %Entry
+; CHECK-BE-NEXT: clrldi 0, 1, 59
+; CHECK-BE-NEXT: std 30, -16(1)
+; CHECK-BE-NEXT: mr 30, 1
+; CHECK-BE-NEXT: subfic 0, 0, -128
+; CHECK-BE-NEXT: stdux 1, 1, 0
+; CHECK-BE-NEXT: .cfi_def_cfa_register r30
+; CHECK-BE-NEXT: .cfi_offset r30, -16
+; CHECK-BE-NEXT: li 3, 1
+; CHECK-BE-NEXT: xxlxor 35, 35, 35
+; CHECK-BE-NEXT: vextublx 5, 3, 2
+; CHECK-BE-NEXT: li 3, 0
+; CHECK-BE-NEXT: vextublx 4, 3, 2
+; CHECK-BE-NEXT: clrldi 4, 4, 63
+; CHECK-BE-NEXT: rlwimi 4, 5, 1, 30, 30
+; CHECK-BE-NEXT: li 5, 2
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 2, 29, 29
+; CHECK-BE-NEXT: li 5, 3
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 3, 28, 28
+; CHECK-BE-NEXT: li 5, 4
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 4, 27, 27
+; CHECK-BE-NEXT: li 5, 5
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 5, 26, 26
+; CHECK-BE-NEXT: li 5, 6
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 6, 25, 25
+; CHECK-BE-NEXT: li 5, 7
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 7, 24, 24
+; CHECK-BE-NEXT: li 5, 8
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 8, 23, 23
+; CHECK-BE-NEXT: li 5, 9
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 9, 22, 22
+; CHECK-BE-NEXT: li 5, 10
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 10, 21, 21
+; CHECK-BE-NEXT: li 5, 11
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 11, 20, 20
+; CHECK-BE-NEXT: li 5, 12
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 12, 19, 19
+; CHECK-BE-NEXT: li 5, 13
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 13, 18, 18
+; CHECK-BE-NEXT: li 5, 14
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 14, 17, 17
+; CHECK-BE-NEXT: li 5, 15
+; CHECK-BE-NEXT: vextublx 5, 5, 2
+; CHECK-BE-NEXT: rlwimi 4, 5, 15, 16, 16
+; CHECK-BE-NEXT: mtvsrdd 34, 4, 3
+; CHECK-BE-NEXT: mr 1, 30
+; CHECK-BE-NEXT: ld 30, -16(1)
+; CHECK-BE-NEXT: blr
+Entry:
+ %1 = bitcast <16 x i1> %0 to i16
+ %2 = insertelement <16 x i16> zeroinitializer, i16 %1, i64 0
+ ret <16 x i16> %2
+}
>From b6e6fd773833a54f49b0d184dc201d123b603429 Mon Sep 17 00:00:00 2001
From: Maryam Moghadas <maryammo at ca.ibm.com>
Date: Wed, 4 Mar 2026 22:44:16 +0000
Subject: [PATCH 2/6] [PowerPC] Optimize bitcast(truncate) patterns using
vbpermq/vbpermd
Use vbpermq and vbpermd to efficiently pack i1 vector bits into scalar
integers, avoiding stack operations during type legalization.
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 122 +++++++++++-----
llvm/lib/Target/PowerPC/PPCISelLowering.h | 2 +
.../PowerPC/bitcast-truncate-vec-i1.ll | 99 +++++++++++++
.../PowerPC/mma-build-vector-v256i1.ll | 135 ------------------
4 files changed, 188 insertions(+), 170 deletions(-)
create mode 100644 llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll
delete mode 100644 llvm/test/CodeGen/PowerPC/mma-build-vector-v256i1.ll
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index fd39c71247af8..22da7a994d482 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1382,7 +1382,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass);
setOperationAction(ISD::LOAD, MVT::v256i1, Custom);
setOperationAction(ISD::STORE, MVT::v256i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v256i1, Custom);
}
if (Subtarget.hasMMA()) {
if (Subtarget.isISAFuture()) {
@@ -1456,6 +1455,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});
}
+ if (Subtarget.hasP8Vector())
+ setTargetDAGCombine(ISD::BITCAST);
+
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
@@ -9547,40 +9549,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
- EVT VT1 = Op.getValueType();
- if (VT1 == MVT::v256i1) {
- assert(Subtarget.pairedVectorMemops() &&
- "v256i1 requires paired vector support");
-
- // Group 256 individual i1 bits into 4 i64 scalars
- SmallVector<SDValue, 4> Vals;
-
- for (unsigned i = 0; i < 4; ++i) {
- SDValue Val = DAG.getConstant(0, dl, MVT::i64);
- for (unsigned j = 0; j < 64; ++j) {
- SDValue Elt = Op.getOperand(i * 64 + j);
- if (Elt.isUndef())
- continue;
- SDValue Bit = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Elt);
- if (j > 0)
- Bit = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit,
- DAG.getConstant(j, dl, MVT::i64));
- Val = DAG.getNode(ISD::OR, dl, MVT::i64, Val, Bit);
- }
- Vals.push_back(Val);
- }
-
- SDValue Low = DAG.getBuildVector(MVT::v2i64, dl, {Vals[0], Vals[1]});
- SDValue High = DAG.getBuildVector(MVT::v2i64, dl, {Vals[2], Vals[3]});
-
- Low = DAG.getBitcast(MVT::v4i32, Low);
- High = DAG.getBitcast(MVT::v4i32, High);
-
- return DAG.getNode(PPCISD::PAIR_BUILD, dl, MVT::v256i1,
- Subtarget.isLittleEndian() ? High : Low,
- Subtarget.isLittleEndian() ? Low : High);
- }
-
if (Subtarget.hasP10Vector()) {
APInt BitMask(32, 0);
// If the value of the vector is all zeros or all ones,
@@ -18261,6 +18229,81 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return DAGCombineBuildVector(N, DCI);
case PPCISD::ADDC:
return DAGCombineAddc(N, DCI);
+
+ case ISD::BITCAST: {
+ // Optimize the following patterns using vbpermq/vbpermd:
+ // i16 = bitcast(v16i1 truncate(v16i8))
+ // i8 = bitcast(v8i1 truncate(v8i16))
+ // i8 = bitcast(v8i1 truncate(v8i8))
+ SDValue Op0 = N->getOperand(0);
+ EVT ResVT = N->getValueType(0);
+ if (Op0.getOpcode() != ISD::TRUNCATE)
+ break;
+
+ SDValue Src = Op0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ bool IsV16i8 = (ResVT == MVT::i16 && SrcVT == MVT::v16i8);
+ bool IsV8i16 = (ResVT == MVT::i8 && SrcVT == MVT::v8i16);
+ bool IsV8i8 = (ResVT == MVT::i8 && SrcVT == MVT::v8i8);
+ bool IsLE = Subtarget.isLittleEndian();
+ unsigned EltIdx = IsLE ? 1 : 0;
+
+ if (IsV16i8 || IsV8i16) {
+ SDLoc dl(N);
+ int NumElts = IsV16i8 ? 16 : 8;
+ int EltSize = IsV16i8 ? 8 : 16;
+
+ SmallVector<SDValue, 16> Ops;
+ for (int i = 0; i < 16; ++i) {
+ int ByteIdx = IsLE ? (15 - i) : i;
+ int Index =
+ (ByteIdx < NumElts) ? (ByteIdx * EltSize + (EltSize - 1)) : 128;
+ Ops.push_back(DAG.getConstant(Index, dl, MVT::i8));
+ }
+
+ SDValue Indices = DAG.getBuildVector(MVT::v16i8, dl, Ops);
+ SDValue VBPerm = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
+ DAG.getConstant(Intrinsic::ppc_altivec_vbpermq, dl, MVT::i32),
+ DAG.getBitcast(MVT::v16i8, Src), Indices);
+
+ SDValue V2i64 = DAG.getBitcast(MVT::v2i64, VBPerm);
+ SDValue DW0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, V2i64,
+ DAG.getIntPtrConstant(EltIdx, dl));
+
+ if (IsV8i16)
+ DW0 = DAG.getNode(ISD::SRL, dl, MVT::i64, DW0,
+ DAG.getConstant(8, dl, MVT::i32));
+
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT, DW0);
+ }
+
+ if (IsV8i8 && Subtarget.hasP9Vector()) {
+ SDLoc dl(N);
+ SmallVector<SDValue, 16> Ops;
+ for (int i = 0; i < 16; ++i) {
+ int ByteIdx = IsLE ? (7 - i) : i;
+ int Index = (ByteIdx >= 0 && ByteIdx < 8) ? (ByteIdx * 8 + 7) : 128;
+ Ops.push_back(DAG.getConstant(Index, dl, MVT::i8));
+ }
+
+ SDValue Indices = DAG.getBuildVector(MVT::v16i8, dl, Ops);
+ SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+ SDValue V16i8Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i8,
+ Undef, Src, DAG.getIntPtrConstant(0, dl));
+
+ SDValue VBPermD = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
+ DAG.getConstant(Intrinsic::ppc_altivec_vbpermd, dl, MVT::i32),
+ V16i8Src, Indices);
+
+ SDValue DW0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ DAG.getBitcast(MVT::v2i64, VBPermD),
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT, DW0);
+ }
+ break;
+ }
}
return SDValue();
@@ -20700,3 +20743,12 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
return Subtarget.useCRBits();
}
+
+/// Targets can use this to indicate that they only support some
+/// VECTOR_SHUFFLE operations. PPC does not support shuffles on i1 element
+/// types, which are instead handled via DAG combine.
+bool PPCTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
+ if (VT.getScalarType() == MVT::i1)
+ return false;
+ return true;
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index cfcc6b5f03edc..d3f959e15ff72 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -925,6 +925,8 @@ namespace llvm {
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+ bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
+
/// getAddrModeForFlags - Based on the set of address flags, select the most
/// optimal instruction format to match by.
PPC::AddrMode getAddrModeForFlags(unsigned Flags) const;
diff --git a/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll b/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll
new file mode 100644
index 0000000000000..a35fd62e9cbcd
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=powerpc64le -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-LE
+; RUN: llc -mtriple=powerpc64 -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-BE
+
+; i16 = bitcast(v16i1 truncate(v16i8))
+define fastcc <16 x i16> @test_v16i8_to_i16(<16 x i1> %0) {
+; CHECK-LE-LABEL: test_v16i8_to_i16:
+; CHECK-LE: # %bb.0: # %Entry
+; CHECK-LE-NEXT: plxv 35, .LCPI0_0@PCREL(0), 1
+; CHECK-LE-NEXT: plxv 1, .LCPI0_1@PCREL(0), 1
+; CHECK-LE-NEXT: li 4, 0
+; CHECK-LE-NEXT: mtfprd 0, 4
+; CHECK-LE-NEXT: vbpermq 2, 2, 3
+; CHECK-LE-NEXT: xxlxor 35, 35, 35
+; CHECK-LE-NEXT: mfvsrwz 3, 34
+; CHECK-LE-NEXT: mtvsrd 34, 3
+; CHECK-LE-NEXT: xxperm 34, 0, 1
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_v16i8_to_i16:
+; CHECK-BE: # %bb.0: # %Entry
+; CHECK-BE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; CHECK-BE-NEXT: lxv 35, 0(3)
+; CHECK-BE-NEXT: vbpermq 2, 2, 3
+; CHECK-BE-NEXT: xxlxor 35, 35, 35
+; CHECK-BE-NEXT: mfvsrwz 3, 34
+; CHECK-BE-NEXT: mtfprwz 0, 3
+; CHECK-BE-NEXT: addis 3, 2, .LCPI0_1@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI0_1@toc@l
+; CHECK-BE-NEXT: lxv 1, 0(3)
+; CHECK-BE-NEXT: li 3, 0
+; CHECK-BE-NEXT: mtvsrwz 34, 3
+; CHECK-BE-NEXT: xxperm 34, 0, 1
+; CHECK-BE-NEXT: blr
+Entry:
+ %1 = bitcast <16 x i1> %0 to i16
+ %2 = insertelement <16 x i16> zeroinitializer, i16 %1, i64 0
+ ret <16 x i16> %2
+}
+
+; i8 = bitcast(v8i1 truncate(v8i16))
+define fastcc <16 x i8> @test_v8i16_to_i8(<8 x i1> %0) {
+; CHECK-LE-LABEL: test_v8i16_to_i8:
+; CHECK-LE: # %bb.0: # %Entry
+; CHECK-LE-NEXT: plxv 35, .LCPI1_0@PCREL(0), 1
+; CHECK-LE-NEXT: plxv 1, .LCPI1_1@PCREL(0), 1
+; CHECK-LE-NEXT: li 4, 0
+; CHECK-LE-NEXT: mtfprd 0, 4
+; CHECK-LE-NEXT: vbpermq 2, 2, 3
+; CHECK-LE-NEXT: mfvsrd 3, 34
+; CHECK-LE-NEXT: rldicl 3, 3, 56, 8
+; CHECK-LE-NEXT: mtvsrd 34, 3
+; CHECK-LE-NEXT: xxperm 34, 0, 1
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_v8i16_to_i8:
+; CHECK-BE: # %bb.0: # %Entry
+; CHECK-BE-NEXT: addis 3, 2, .LCPI1_0@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI1_0@toc@l
+; CHECK-BE-NEXT: lxv 35, 0(3)
+; CHECK-BE-NEXT: vbpermq 2, 2, 3
+; CHECK-BE-NEXT: mfvsrd 3, 34
+; CHECK-BE-NEXT: rldicl 3, 3, 56, 8
+; CHECK-BE-NEXT: mtfprwz 0, 3
+; CHECK-BE-NEXT: addis 3, 2, .LCPI1_1@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI1_1@toc@l
+; CHECK-BE-NEXT: lxv 1, 0(3)
+; CHECK-BE-NEXT: li 3, 0
+; CHECK-BE-NEXT: mtvsrwz 34, 3
+; CHECK-BE-NEXT: xxperm 34, 0, 1
+; CHECK-BE-NEXT: blr
+Entry:
+ %2 = bitcast <8 x i1> %0 to i8
+ %3 = insertelement <16 x i8> zeroinitializer, i8 %2, i64 0
+ ret <16 x i8> %3
+}
+
+; i8 = bitcast(v8i1 truncate(v8i8))
+define i8 @test_v8i8_to_i8(<8 x i8> %a) {
+; CHECK-LE-LABEL: test_v8i8_to_i8:
+; CHECK-LE: # %bb.0:
+; CHECK-LE-NEXT: plxv 35, .LCPI2_0@PCREL(0), 1
+; CHECK-LE-NEXT: vbpermd 2, 2, 3
+; CHECK-LE-NEXT: mfvsrld 3, 34
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_v8i8_to_i8:
+; CHECK-BE: # %bb.0:
+; CHECK-BE-NEXT: addis 3, 2, .LCPI2_0@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI2_0@toc@l
+; CHECK-BE-NEXT: lxv 35, 0(3)
+; CHECK-BE-NEXT: vbpermd 2, 2, 3
+; CHECK-BE-NEXT: mfvsrd 3, 34
+; CHECK-BE-NEXT: blr
+ %1 = trunc <8 x i8> %a to <8 x i1>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
diff --git a/llvm/test/CodeGen/PowerPC/mma-build-vector-v256i1.ll b/llvm/test/CodeGen/PowerPC/mma-build-vector-v256i1.ll
deleted file mode 100644
index fd6a5d93784ea..0000000000000
--- a/llvm/test/CodeGen/PowerPC/mma-build-vector-v256i1.ll
+++ /dev/null
@@ -1,135 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=powerpc64le -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-LE
-; RUN: llc -mtriple=powerpc64 -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-BE
-
-define fastcc <16 x i16> @test(<16 x i1> %0) {
-; CHECK-LE-LABEL: test:
-; CHECK-LE: # %bb.0: # %Entry
-; CHECK-LE-NEXT: clrldi 0, 1, 59
-; CHECK-LE-NEXT: std 30, -16(1)
-; CHECK-LE-NEXT: mr 30, 1
-; CHECK-LE-NEXT: subfic 0, 0, -96
-; CHECK-LE-NEXT: stdux 1, 1, 0
-; CHECK-LE-NEXT: .cfi_def_cfa_register r30
-; CHECK-LE-NEXT: .cfi_offset r30, -16
-; CHECK-LE-NEXT: li 3, 1
-; CHECK-LE-NEXT: xxlxor 35, 35, 35
-; CHECK-LE-NEXT: vextubrx 4, 3, 2
-; CHECK-LE-NEXT: li 3, 0
-; CHECK-LE-NEXT: vextubrx 3, 3, 2
-; CHECK-LE-NEXT: clrldi 3, 3, 63
-; CHECK-LE-NEXT: rlwimi 3, 4, 1, 30, 30
-; CHECK-LE-NEXT: li 4, 2
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 2, 29, 29
-; CHECK-LE-NEXT: li 4, 3
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 3, 28, 28
-; CHECK-LE-NEXT: li 4, 4
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 4, 27, 27
-; CHECK-LE-NEXT: li 4, 5
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 5, 26, 26
-; CHECK-LE-NEXT: li 4, 6
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 6, 25, 25
-; CHECK-LE-NEXT: li 4, 7
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 7, 24, 24
-; CHECK-LE-NEXT: li 4, 8
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 8, 23, 23
-; CHECK-LE-NEXT: li 4, 9
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 9, 22, 22
-; CHECK-LE-NEXT: li 4, 10
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 10, 21, 21
-; CHECK-LE-NEXT: li 4, 11
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 11, 20, 20
-; CHECK-LE-NEXT: li 4, 12
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 12, 19, 19
-; CHECK-LE-NEXT: li 4, 13
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 13, 18, 18
-; CHECK-LE-NEXT: li 4, 14
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 14, 17, 17
-; CHECK-LE-NEXT: li 4, 15
-; CHECK-LE-NEXT: vextubrx 4, 4, 2
-; CHECK-LE-NEXT: rlwimi 3, 4, 15, 16, 16
-; CHECK-LE-NEXT: mtvsrdd 34, 0, 3
-; CHECK-LE-NEXT: mr 1, 30
-; CHECK-LE-NEXT: ld 30, -16(1)
-; CHECK-LE-NEXT: blr
-;
-; CHECK-BE-LABEL: test:
-; CHECK-BE: # %bb.0: # %Entry
-; CHECK-BE-NEXT: clrldi 0, 1, 59
-; CHECK-BE-NEXT: std 30, -16(1)
-; CHECK-BE-NEXT: mr 30, 1
-; CHECK-BE-NEXT: subfic 0, 0, -128
-; CHECK-BE-NEXT: stdux 1, 1, 0
-; CHECK-BE-NEXT: .cfi_def_cfa_register r30
-; CHECK-BE-NEXT: .cfi_offset r30, -16
-; CHECK-BE-NEXT: li 3, 1
-; CHECK-BE-NEXT: xxlxor 35, 35, 35
-; CHECK-BE-NEXT: vextublx 5, 3, 2
-; CHECK-BE-NEXT: li 3, 0
-; CHECK-BE-NEXT: vextublx 4, 3, 2
-; CHECK-BE-NEXT: clrldi 4, 4, 63
-; CHECK-BE-NEXT: rlwimi 4, 5, 1, 30, 30
-; CHECK-BE-NEXT: li 5, 2
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 2, 29, 29
-; CHECK-BE-NEXT: li 5, 3
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 3, 28, 28
-; CHECK-BE-NEXT: li 5, 4
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 4, 27, 27
-; CHECK-BE-NEXT: li 5, 5
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 5, 26, 26
-; CHECK-BE-NEXT: li 5, 6
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 6, 25, 25
-; CHECK-BE-NEXT: li 5, 7
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 7, 24, 24
-; CHECK-BE-NEXT: li 5, 8
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 8, 23, 23
-; CHECK-BE-NEXT: li 5, 9
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 9, 22, 22
-; CHECK-BE-NEXT: li 5, 10
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 10, 21, 21
-; CHECK-BE-NEXT: li 5, 11
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 11, 20, 20
-; CHECK-BE-NEXT: li 5, 12
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 12, 19, 19
-; CHECK-BE-NEXT: li 5, 13
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 13, 18, 18
-; CHECK-BE-NEXT: li 5, 14
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 14, 17, 17
-; CHECK-BE-NEXT: li 5, 15
-; CHECK-BE-NEXT: vextublx 5, 5, 2
-; CHECK-BE-NEXT: rlwimi 4, 5, 15, 16, 16
-; CHECK-BE-NEXT: mtvsrdd 34, 4, 3
-; CHECK-BE-NEXT: mr 1, 30
-; CHECK-BE-NEXT: ld 30, -16(1)
-; CHECK-BE-NEXT: blr
-Entry:
- %1 = bitcast <16 x i1> %0 to i16
- %2 = insertelement <16 x i16> zeroinitializer, i16 %1, i64 0
- ret <16 x i16> %2
-}
>From 4afdccf48aa29123e342babe0a7f5a4f63426d78 Mon Sep 17 00:00:00 2001
From: Maryam Moghadas <maryammo at ca.ibm.com>
Date: Tue, 24 Mar 2026 16:20:03 +0000
Subject: [PATCH 3/6] Address review comments
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 140 ++++++++----------
llvm/lib/Target/PowerPC/PPCISelLowering.h | 2 +
.../PowerPC/bitcast-truncate-vec-i1.ll | 120 ++++++++++++++-
3 files changed, 176 insertions(+), 86 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 22da7a994d482..dd477c2ca3641 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -18230,80 +18230,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
case PPCISD::ADDC:
return DAGCombineAddc(N, DCI);
- case ISD::BITCAST: {
- // Optimize the following patterns using vbpermq/vbpermd:
- // i16 = bitcast(v16i1 truncate(v16i8))
- // i8 = bitcast(v8i1 truncate(v8i16))
- // i8 = bitcast(v8i1 truncate(v8i8))
- SDValue Op0 = N->getOperand(0);
- EVT ResVT = N->getValueType(0);
- if (Op0.getOpcode() != ISD::TRUNCATE)
- break;
-
- SDValue Src = Op0.getOperand(0);
- EVT SrcVT = Src.getValueType();
- bool IsV16i8 = (ResVT == MVT::i16 && SrcVT == MVT::v16i8);
- bool IsV8i16 = (ResVT == MVT::i8 && SrcVT == MVT::v8i16);
- bool IsV8i8 = (ResVT == MVT::i8 && SrcVT == MVT::v8i8);
- bool IsLE = Subtarget.isLittleEndian();
- unsigned EltIdx = IsLE ? 1 : 0;
-
- if (IsV16i8 || IsV8i16) {
- SDLoc dl(N);
- int NumElts = IsV16i8 ? 16 : 8;
- int EltSize = IsV16i8 ? 8 : 16;
-
- SmallVector<SDValue, 16> Ops;
- for (int i = 0; i < 16; ++i) {
- int ByteIdx = IsLE ? (15 - i) : i;
- int Index =
- (ByteIdx < NumElts) ? (ByteIdx * EltSize + (EltSize - 1)) : 128;
- Ops.push_back(DAG.getConstant(Index, dl, MVT::i8));
- }
-
- SDValue Indices = DAG.getBuildVector(MVT::v16i8, dl, Ops);
- SDValue VBPerm = DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
- DAG.getConstant(Intrinsic::ppc_altivec_vbpermq, dl, MVT::i32),
- DAG.getBitcast(MVT::v16i8, Src), Indices);
-
- SDValue V2i64 = DAG.getBitcast(MVT::v2i64, VBPerm);
- SDValue DW0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, V2i64,
- DAG.getIntPtrConstant(EltIdx, dl));
-
- if (IsV8i16)
- DW0 = DAG.getNode(ISD::SRL, dl, MVT::i64, DW0,
- DAG.getConstant(8, dl, MVT::i32));
-
- return DAG.getNode(ISD::TRUNCATE, dl, ResVT, DW0);
- }
-
- if (IsV8i8 && Subtarget.hasP9Vector()) {
- SDLoc dl(N);
- SmallVector<SDValue, 16> Ops;
- for (int i = 0; i < 16; ++i) {
- int ByteIdx = IsLE ? (7 - i) : i;
- int Index = (ByteIdx >= 0 && ByteIdx < 8) ? (ByteIdx * 8 + 7) : 128;
- Ops.push_back(DAG.getConstant(Index, dl, MVT::i8));
- }
-
- SDValue Indices = DAG.getBuildVector(MVT::v16i8, dl, Ops);
- SDValue Undef = DAG.getUNDEF(MVT::v16i8);
- SDValue V16i8Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i8,
- Undef, Src, DAG.getIntPtrConstant(0, dl));
-
- SDValue VBPermD = DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
- DAG.getConstant(Intrinsic::ppc_altivec_vbpermd, dl, MVT::i32),
- V16i8Src, Indices);
-
- SDValue DW0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
- DAG.getBitcast(MVT::v2i64, VBPermD),
- DAG.getIntPtrConstant(0, dl));
- return DAG.getNode(ISD::TRUNCATE, dl, ResVT, DW0);
- }
- break;
- }
+ case ISD::BITCAST:
+ return GenerateVBPERM(N, DCI);
}
return SDValue();
@@ -20744,11 +20672,67 @@ bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
return Subtarget.useCRBits();
}
-/// Targets can use this to indicate that they only support some
-/// VECTOR_SHUFFLE operations. PPC does not support shuffles on i1 element
-/// types, which are instead handled via DAG combine.
+/// Shuffle masks for vectors of bits are not legal as such vectors are
+/// reserved for MMA/DM.
bool PPCTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
if (VT.getScalarType() == MVT::i1)
return false;
- return true;
+ return TargetLowering::isShuffleMaskLegal(Mask, VT);
+}
+
+// Optimize the following patterns using vbpermq/vbpermd:
+// i16 = bitcast(v16i1 truncate(v16i8))
+// i8 = bitcast(v8i1 truncate(v8i16))
+// i8 = bitcast(v8i1 truncate(v8i8))
+SDValue PPCTargetLowering::GenerateVBPERM(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue Op0 = N->getOperand(0);
+ if (Op0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ SDValue Src = Op0.getOperand(0);
+ EVT ResVT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+ bool IsV16i8 = (ResVT == MVT::i16 && SrcVT == MVT::v16i8);
+ bool IsV8i16 = (ResVT == MVT::i8 && SrcVT == MVT::v8i16);
+ bool IsV8i8 = (ResVT == MVT::i8 && SrcVT == MVT::v8i8);
+ unsigned EltIdx = 2;
+ bool IsLE = Subtarget.isLittleEndian();
+
+ if (!IsV16i8 && !IsV8i16 && !IsV8i8)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ if (IsV8i8) {
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i8,
+ DAG.getUNDEF(MVT::v16i8), Src,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SmallVector<int, 16> BitIndices(16, 128);
+ unsigned NumElts = SrcVT.getVectorNumElements();
+ unsigned EltSize = SrcVT.getScalarType().getSizeInBits();
+ for (int Idx = 0, End = SrcVT.getVectorNumElements(); Idx < End; Idx++) {
+ BitIndices[Idx] = EltSize * (NumElts - Idx) - 1;
+ if (IsV8i8 && IsLE)
+ BitIndices[Idx] += 64;
+ }
+ if (!IsLE) {
+ std::reverse(BitIndices.begin(), BitIndices.end());
+ EltIdx = 1;
+ }
+
+ SmallVector<SDValue, 16> BVOps;
+ for (auto Idx : BitIndices)
+ BVOps.push_back(DAG.getConstant(Idx, dl, MVT::i8));
+ SDValue VRB = DAG.getBuildVector(MVT::v16i8, dl, BVOps);
+ SDValue VBPerm =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
+ DAG.getConstant(Intrinsic::ppc_altivec_vbpermq, dl, MVT::i32),
+ DAG.getBitcast(MVT::v16i8, Src), VRB);
+ SDValue ForExtract = DAG.getBitcast(MVT::v4i32, VBPerm);
+ SDValue Extracted =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, ForExtract,
+ DAG.getIntPtrConstant(EltIdx, dl));
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Extracted);
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index d3f959e15ff72..6365ba72baeea 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,6 +927,8 @@ namespace llvm {
bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
+ SDValue GenerateVBPERM(SDNode *N, DAGCombinerInfo &DCI) const;
+
/// getAddrModeForFlags - Based on the set of address flags, select the most
/// optimal instruction format to match by.
PPC::AddrMode getAddrModeForFlags(unsigned Flags) const;
diff --git a/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll b/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll
index a35fd62e9cbcd..e89ff5c8c118c 100644
--- a/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll
+++ b/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=powerpc64le -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-LE
; RUN: llc -mtriple=powerpc64 -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -mtriple=powerpc64 -mcpu=pwr7 < %s | FileCheck %s --check-prefix=CHECK-P7-BE
; i16 = bitcast(v16i1 truncate(v16i8))
define fastcc <16 x i16> @test_v16i8_to_i16(<16 x i1> %0) {
@@ -33,6 +34,55 @@ define fastcc <16 x i16> @test_v16i8_to_i16(<16 x i1> %0) {
; CHECK-BE-NEXT: mtvsrwz 34, 3
; CHECK-BE-NEXT: xxperm 34, 0, 1
; CHECK-BE-NEXT: blr
+;
+; CHECK-P7-BE-LABEL: test_v16i8_to_i16:
+; CHECK-P7-BE: # %bb.0: # %Entry
+; CHECK-P7-BE-NEXT: addi 3, 1, -32
+; CHECK-P7-BE-NEXT: xxlxor 36, 36, 36
+; CHECK-P7-BE-NEXT: stxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: lbz 3, -17(1)
+; CHECK-P7-BE-NEXT: lbz 4, -18(1)
+; CHECK-P7-BE-NEXT: clrlwi 3, 3, 31
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 1, 30, 30
+; CHECK-P7-BE-NEXT: lbz 4, -19(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 2, 29, 29
+; CHECK-P7-BE-NEXT: lbz 4, -20(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 3, 28, 28
+; CHECK-P7-BE-NEXT: lbz 4, -21(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 4, 27, 27
+; CHECK-P7-BE-NEXT: lbz 4, -22(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 5, 26, 26
+; CHECK-P7-BE-NEXT: lbz 4, -23(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 6, 25, 25
+; CHECK-P7-BE-NEXT: lbz 4, -24(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 7, 24, 24
+; CHECK-P7-BE-NEXT: lbz 4, -25(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 8, 23, 23
+; CHECK-P7-BE-NEXT: lbz 4, -26(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 9, 22, 22
+; CHECK-P7-BE-NEXT: lbz 4, -27(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 10, 21, 21
+; CHECK-P7-BE-NEXT: lbz 4, -28(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 11, 20, 20
+; CHECK-P7-BE-NEXT: lbz 4, -29(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 12, 19, 19
+; CHECK-P7-BE-NEXT: lbz 4, -30(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 13, 18, 18
+; CHECK-P7-BE-NEXT: lbz 4, -31(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 14, 17, 17
+; CHECK-P7-BE-NEXT: lbz 4, -32(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 15, 16, 16
+; CHECK-P7-BE-NEXT: sldi 3, 3, 48
+; CHECK-P7-BE-NEXT: std 3, -48(1)
+; CHECK-P7-BE-NEXT: std 3, -40(1)
+; CHECK-P7-BE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; CHECK-P7-BE-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; CHECK-P7-BE-NEXT: lxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: addi 3, 1, -48
+; CHECK-P7-BE-NEXT: lxvw4x 35, 0, 3
+; CHECK-P7-BE-NEXT: vperm 2, 3, 4, 2
+; CHECK-P7-BE-NEXT: xxlxor 35, 35, 35
+; CHECK-P7-BE-NEXT: blr
Entry:
%1 = bitcast <16 x i1> %0 to i16
%2 = insertelement <16 x i16> zeroinitializer, i16 %1, i64 0
@@ -48,8 +98,7 @@ define fastcc <16 x i8> @test_v8i16_to_i8(<8 x i1> %0) {
; CHECK-LE-NEXT: li 4, 0
; CHECK-LE-NEXT: mtfprd 0, 4
; CHECK-LE-NEXT: vbpermq 2, 2, 3
-; CHECK-LE-NEXT: mfvsrd 3, 34
-; CHECK-LE-NEXT: rldicl 3, 3, 56, 8
+; CHECK-LE-NEXT: mfvsrwz 3, 34
; CHECK-LE-NEXT: mtvsrd 34, 3
; CHECK-LE-NEXT: xxperm 34, 0, 1
; CHECK-LE-NEXT: blr
@@ -60,8 +109,7 @@ define fastcc <16 x i8> @test_v8i16_to_i8(<8 x i1> %0) {
; CHECK-BE-NEXT: addi 3, 3, .LCPI1_0@toc@l
; CHECK-BE-NEXT: lxv 35, 0(3)
; CHECK-BE-NEXT: vbpermq 2, 2, 3
-; CHECK-BE-NEXT: mfvsrd 3, 34
-; CHECK-BE-NEXT: rldicl 3, 3, 56, 8
+; CHECK-BE-NEXT: mfvsrwz 3, 34
; CHECK-BE-NEXT: mtfprwz 0, 3
; CHECK-BE-NEXT: addis 3, 2, .LCPI1_1@toc@ha
; CHECK-BE-NEXT: addi 3, 3, .LCPI1_1@toc@l
@@ -70,6 +118,38 @@ define fastcc <16 x i8> @test_v8i16_to_i8(<8 x i1> %0) {
; CHECK-BE-NEXT: mtvsrwz 34, 3
; CHECK-BE-NEXT: xxperm 34, 0, 1
; CHECK-BE-NEXT: blr
+;
+; CHECK-P7-BE-LABEL: test_v8i16_to_i8:
+; CHECK-P7-BE: # %bb.0: # %Entry
+; CHECK-P7-BE-NEXT: addi 3, 1, -32
+; CHECK-P7-BE-NEXT: xxlxor 36, 36, 36
+; CHECK-P7-BE-NEXT: stxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: lhz 3, -18(1)
+; CHECK-P7-BE-NEXT: lhz 4, -20(1)
+; CHECK-P7-BE-NEXT: clrlwi 3, 3, 31
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 1, 30, 30
+; CHECK-P7-BE-NEXT: lhz 4, -22(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 2, 29, 29
+; CHECK-P7-BE-NEXT: lhz 4, -24(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 3, 28, 28
+; CHECK-P7-BE-NEXT: lhz 4, -26(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 4, 27, 27
+; CHECK-P7-BE-NEXT: lhz 4, -28(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 5, 26, 26
+; CHECK-P7-BE-NEXT: lhz 4, -30(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 6, 25, 25
+; CHECK-P7-BE-NEXT: lhz 4, -32(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 7, 24, 24
+; CHECK-P7-BE-NEXT: sldi 3, 3, 56
+; CHECK-P7-BE-NEXT: std 3, -48(1)
+; CHECK-P7-BE-NEXT: std 3, -40(1)
+; CHECK-P7-BE-NEXT: addis 3, 2, .LCPI1_0@toc@ha
+; CHECK-P7-BE-NEXT: addi 3, 3, .LCPI1_0@toc@l
+; CHECK-P7-BE-NEXT: lxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: addi 3, 1, -48
+; CHECK-P7-BE-NEXT: lxvw4x 35, 0, 3
+; CHECK-P7-BE-NEXT: vperm 2, 3, 4, 2
+; CHECK-P7-BE-NEXT: blr
Entry:
%2 = bitcast <8 x i1> %0 to i8
%3 = insertelement <16 x i8> zeroinitializer, i8 %2, i64 0
@@ -81,8 +161,8 @@ define i8 @test_v8i8_to_i8(<8 x i8> %a) {
; CHECK-LE-LABEL: test_v8i8_to_i8:
; CHECK-LE: # %bb.0:
; CHECK-LE-NEXT: plxv 35, .LCPI2_0@PCREL(0), 1
-; CHECK-LE-NEXT: vbpermd 2, 2, 3
-; CHECK-LE-NEXT: mfvsrld 3, 34
+; CHECK-LE-NEXT: vbpermq 2, 2, 3
+; CHECK-LE-NEXT: mfvsrwz 3, 34
; CHECK-LE-NEXT: blr
;
; CHECK-BE-LABEL: test_v8i8_to_i8:
@@ -90,9 +170,33 @@ define i8 @test_v8i8_to_i8(<8 x i8> %a) {
; CHECK-BE-NEXT: addis 3, 2, .LCPI2_0@toc@ha
; CHECK-BE-NEXT: addi 3, 3, .LCPI2_0@toc@l
; CHECK-BE-NEXT: lxv 35, 0(3)
-; CHECK-BE-NEXT: vbpermd 2, 2, 3
-; CHECK-BE-NEXT: mfvsrd 3, 34
+; CHECK-BE-NEXT: vbpermq 2, 2, 3
+; CHECK-BE-NEXT: mfvsrwz 3, 34
; CHECK-BE-NEXT: blr
+;
+; CHECK-P7-BE-LABEL: test_v8i8_to_i8:
+; CHECK-P7-BE: # %bb.0:
+; CHECK-P7-BE-NEXT: addi 3, 1, -32
+; CHECK-P7-BE-NEXT: stxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: lbz 3, -25(1)
+; CHECK-P7-BE-NEXT: lbz 4, -26(1)
+; CHECK-P7-BE-NEXT: clrlwi 3, 3, 31
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 1, 30, 30
+; CHECK-P7-BE-NEXT: lbz 4, -27(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 2, 29, 29
+; CHECK-P7-BE-NEXT: lbz 4, -28(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 3, 28, 28
+; CHECK-P7-BE-NEXT: lbz 4, -29(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 4, 27, 27
+; CHECK-P7-BE-NEXT: lbz 4, -30(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 5, 26, 26
+; CHECK-P7-BE-NEXT: lbz 4, -31(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 6, 25, 25
+; CHECK-P7-BE-NEXT: lbz 4, -32(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 7, 24, 24
+; CHECK-P7-BE-NEXT: stb 3, -1(1)
+; CHECK-P7-BE-NEXT: lbz 3, -1(1)
+; CHECK-P7-BE-NEXT: blr
%1 = trunc <8 x i8> %a to <8 x i1>
%2 = bitcast <8 x i1> %1 to i8
ret i8 %2
>From ec9d094796fb2fe941e96c980c6138c69db94ab2 Mon Sep 17 00:00:00 2001
From: Maryam Moghadas <maryammo at ca.ibm.com>
Date: Tue, 31 Mar 2026 18:03:15 +0000
Subject: [PATCH 4/6] Rename GenerateVBPERM to DAGCombineBitcast
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 6 +++---
llvm/lib/Target/PowerPC/PPCISelLowering.h | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index dd477c2ca3641..d6e469ed58063 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -18231,7 +18231,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return DAGCombineAddc(N, DCI);
case ISD::BITCAST:
- return GenerateVBPERM(N, DCI);
+ return DAGCombineBitcast(N, DCI);
}
return SDValue();
@@ -20684,8 +20684,8 @@ bool PPCTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
// i16 = bitcast(v16i1 truncate(v16i8))
// i8 = bitcast(v8i1 truncate(v8i16))
// i8 = bitcast(v8i1 truncate(v8i8))
-SDValue PPCTargetLowering::GenerateVBPERM(SDNode *N,
- DAGCombinerInfo &DCI) const {
+SDValue PPCTargetLowering::DAGCombineBitcast(SDNode *N,
+ DAGCombinerInfo &DCI) const {
SDValue Op0 = N->getOperand(0);
if (Op0.getOpcode() != ISD::TRUNCATE)
return SDValue();
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 6365ba72baeea..62db481c04859 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,7 +927,7 @@ namespace llvm {
bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
- SDValue GenerateVBPERM(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue DAGCombineBitcast(SDNode *N, DAGCombinerInfo &DCI) const;
/// getAddrModeForFlags - Based on the set of address flags, select the most
/// optimal instruction format to match by.
>From ef2d6df5f5f0e2599008dac1038cc75f90a2fa0a Mon Sep 17 00:00:00 2001
From: Maryam Moghadas <maryammo at ca.ibm.com>
Date: Tue, 31 Mar 2026 21:23:45 +0000
Subject: [PATCH 5/6] Extract vbpermq generation into a separate helper
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 15 +++++++++------
llvm/lib/Target/PowerPC/PPCISelLowering.h | 2 ++
2 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d6e469ed58063..41cb677038a9e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -20689,21 +20689,25 @@ SDValue PPCTargetLowering::DAGCombineBitcast(SDNode *N,
SDValue Op0 = N->getOperand(0);
if (Op0.getOpcode() != ISD::TRUNCATE)
return SDValue();
-
SDValue Src = Op0.getOperand(0);
EVT ResVT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
+ SDLoc dl(N);
+ return GenerateVBPERM(DCI.DAG, dl, Src, SrcVT, ResVT,
+ Subtarget.isLittleEndian());
+}
+
+SDValue PPCTargetLowering::GenerateVBPERM(SelectionDAG &DAG, SDLoc dl,
+ SDValue Src, EVT SrcVT, EVT ResVT,
+ bool IsLE) const {
bool IsV16i8 = (ResVT == MVT::i16 && SrcVT == MVT::v16i8);
bool IsV8i16 = (ResVT == MVT::i8 && SrcVT == MVT::v8i16);
bool IsV8i8 = (ResVT == MVT::i8 && SrcVT == MVT::v8i8);
- unsigned EltIdx = 2;
- bool IsLE = Subtarget.isLittleEndian();
if (!IsV16i8 && !IsV8i16 && !IsV8i8)
return SDValue();
- SelectionDAG &DAG = DCI.DAG;
- SDLoc dl(N);
+ unsigned EltIdx = 2;
if (IsV8i8) {
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i8,
DAG.getUNDEF(MVT::v16i8), Src,
@@ -20721,7 +20725,6 @@ SDValue PPCTargetLowering::DAGCombineBitcast(SDNode *N,
std::reverse(BitIndices.begin(), BitIndices.end());
EltIdx = 1;
}
-
SmallVector<SDValue, 16> BVOps;
for (auto Idx : BitIndices)
BVOps.push_back(DAG.getConstant(Idx, dl, MVT::i8));
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 62db481c04859..7077d7fa486e1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -928,6 +928,8 @@ namespace llvm {
bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
SDValue DAGCombineBitcast(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue GenerateVBPERM(SelectionDAG &DAG, SDLoc dl, SDValue Src, EVT SrcVT,
+ EVT ResVT, bool IsLE) const;
/// getAddrModeForFlags - Based on the set of address flags, select the most
/// optimal instruction format to match by.
>From 16ed0611a651df4b9e64cedbe31dd52a88cb5cc4 Mon Sep 17 00:00:00 2001
From: Maryam Moghadas <maryammo at ca.ibm.com>
Date: Wed, 1 Apr 2026 19:15:43 +0000
Subject: [PATCH 6/6] Refactor GenerateVBPERM to return the intrinsic result
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 31 +++++++++++----------
1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 41cb677038a9e..9e0daadacaa5d 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -20693,8 +20693,17 @@ SDValue PPCTargetLowering::DAGCombineBitcast(SDNode *N,
EVT ResVT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
SDLoc dl(N);
- return GenerateVBPERM(DCI.DAG, dl, Src, SrcVT, ResVT,
- Subtarget.isLittleEndian());
+ SelectionDAG &DAG = DCI.DAG;
+ bool IsLittleEndian = Subtarget.isLittleEndian();
+
+ SDValue VBPerm = GenerateVBPERM(DAG, dl, Src, SrcVT, ResVT, IsLittleEndian);
+ if (!VBPerm)
+ return SDValue();
+ SDValue ForExtract = DAG.getBitcast(MVT::v4i32, VBPerm);
+ SDValue Extracted =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, ForExtract,
+ DAG.getIntPtrConstant(IsLittleEndian ? 2 : 1, dl));
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Extracted);
}
SDValue PPCTargetLowering::GenerateVBPERM(SelectionDAG &DAG, SDLoc dl,
@@ -20707,7 +20716,6 @@ SDValue PPCTargetLowering::GenerateVBPERM(SelectionDAG &DAG, SDLoc dl,
if (!IsV16i8 && !IsV8i16 && !IsV8i8)
return SDValue();
- unsigned EltIdx = 2;
if (IsV8i8) {
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i8,
DAG.getUNDEF(MVT::v16i8), Src,
@@ -20721,21 +20729,14 @@ SDValue PPCTargetLowering::GenerateVBPERM(SelectionDAG &DAG, SDLoc dl,
if (IsV8i8 && IsLE)
BitIndices[Idx] += 64;
}
- if (!IsLE) {
+ if (!IsLE)
std::reverse(BitIndices.begin(), BitIndices.end());
- EltIdx = 1;
- }
SmallVector<SDValue, 16> BVOps;
for (auto Idx : BitIndices)
BVOps.push_back(DAG.getConstant(Idx, dl, MVT::i8));
SDValue VRB = DAG.getBuildVector(MVT::v16i8, dl, BVOps);
- SDValue VBPerm =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
- DAG.getConstant(Intrinsic::ppc_altivec_vbpermq, dl, MVT::i32),
- DAG.getBitcast(MVT::v16i8, Src), VRB);
- SDValue ForExtract = DAG.getBitcast(MVT::v4i32, VBPerm);
- SDValue Extracted =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, ForExtract,
- DAG.getIntPtrConstant(EltIdx, dl));
- return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Extracted);
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
+ DAG.getConstant(Intrinsic::ppc_altivec_vbpermq, dl, MVT::i32),
+ DAG.getBitcast(MVT::v16i8, Src), VRB);
}
More information about the llvm-commits
mailing list