[llvm-branch-commits] [llvm] release/22.x: [PowerPC] Optimize bitcast(truncate) patterns using vbpermq (#181233) (PR #190802)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Apr 7 08:17:01 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-powerpc
Author: llvmbot
<details>
<summary>Changes</summary>
Backport 668938917493fe05c98d5b725f68dfd17ab8eb2f
Requested by: @<!-- -->alexrp
---
Full diff: https://github.com/llvm/llvm-project/pull/190802.diff
3 Files Affected:
- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+79)
- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.h (+6)
- (added) llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll (+203)
``````````diff
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index f818cce380632..bdba040529d00 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1452,6 +1452,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});
}
+ if (Subtarget.hasP8Vector())
+ setTargetDAGCombine(ISD::BITCAST);
+
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
@@ -17933,6 +17936,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return DAGCombineBuildVector(N, DCI);
case PPCISD::ADDC:
return DAGCombineAddc(N, DCI);
+
+ case ISD::BITCAST:
+ return DAGCombineBitcast(N, DCI);
}
return SDValue();
@@ -20369,3 +20375,76 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
return Subtarget.useCRBits();
}
+
+/// Shuffle masks for vectors of bits are not legal as such vectors are
+/// reserved for MMA/DM.
+bool PPCTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
+ if (VT.getScalarType() == MVT::i1)
+ return false;
+ return TargetLowering::isShuffleMaskLegal(Mask, VT);
+}
+
+// Optimize the following patterns using vbpermq/vbpermd:
+// i16 = bitcast(v16i1 truncate(v16i8))
+// i8 = bitcast(v8i1 truncate(v8i16))
+// i8 = bitcast(v8i1 truncate(v8i8))
+SDValue PPCTargetLowering::DAGCombineBitcast(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue Op0 = N->getOperand(0);
+ if (Op0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+ SDValue Src = Op0.getOperand(0);
+ EVT ResVT = N->getValueType(0);
+ EVT TruncResVT = Op0.getValueType();
+ EVT SrcVT = Src.getValueType();
+ SDLoc dl(N);
+ SelectionDAG &DAG = DCI.DAG;
+ bool IsLittleEndian = Subtarget.isLittleEndian();
+
+ if (ResVT != MVT::i16 && ResVT != MVT::i8)
+ return SDValue();
+ SDValue VBPerm =
+ GenerateVBPERM(DAG, dl, Src, SrcVT, TruncResVT, IsLittleEndian);
+ if (!VBPerm)
+ return SDValue();
+ SDValue ForExtract = DAG.getBitcast(MVT::v4i32, VBPerm);
+ SDValue Extracted =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, ForExtract,
+ DAG.getIntPtrConstant(IsLittleEndian ? 2 : 1, dl));
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Extracted);
+}
+
+SDValue PPCTargetLowering::GenerateVBPERM(SelectionDAG &DAG, SDLoc dl,
+ SDValue Src, EVT SrcVT, EVT ResVT,
+ bool IsLE) const {
+ bool IsV16i8 = (ResVT == MVT::v16i1 && SrcVT == MVT::v16i8);
+ bool IsV8i16 = (ResVT == MVT::v8i1 && SrcVT == MVT::v8i16);
+ bool IsV8i8 = (ResVT == MVT::v8i1 && SrcVT == MVT::v8i8);
+
+ if (!IsV16i8 && !IsV8i16 && !IsV8i8)
+ return SDValue();
+
+ if (IsV8i8) {
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i8,
+ DAG.getUNDEF(MVT::v16i8), Src,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SmallVector<int, 16> BitIndices(16, 128);
+ unsigned NumElts = SrcVT.getVectorNumElements();
+ unsigned EltSize = SrcVT.getScalarType().getSizeInBits();
+ for (int Idx = 0, End = SrcVT.getVectorNumElements(); Idx < End; Idx++) {
+ BitIndices[Idx] = EltSize * (NumElts - Idx) - 1;
+ if (IsV8i8 && IsLE)
+ BitIndices[Idx] += 64;
+ }
+ if (!IsLE)
+ std::reverse(BitIndices.begin(), BitIndices.end());
+ SmallVector<SDValue, 16> BVOps;
+ for (auto Idx : BitIndices)
+ BVOps.push_back(DAG.getConstant(Idx, dl, MVT::i8));
+ SDValue VRB = DAG.getBuildVector(MVT::v16i8, dl, BVOps);
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
+ DAG.getConstant(Intrinsic::ppc_altivec_vbpermq, dl, MVT::i32),
+ DAG.getBitcast(MVT::v16i8, Src), VRB);
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index daae839479c3c..9fd9fb1c07389 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -921,6 +921,12 @@ namespace llvm {
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+ bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
+
+ SDValue DAGCombineBitcast(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue GenerateVBPERM(SelectionDAG &DAG, SDLoc dl, SDValue Src, EVT SrcVT,
+ EVT ResVT, bool IsLE) const;
+
/// getAddrModeForFlags - Based on the set of address flags, select the most
/// optimal instruction format to match by.
PPC::AddrMode getAddrModeForFlags(unsigned Flags) const;
diff --git a/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll b/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll
new file mode 100644
index 0000000000000..e89ff5c8c118c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/bitcast-truncate-vec-i1.ll
@@ -0,0 +1,203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=powerpc64le -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-LE
+; RUN: llc -mtriple=powerpc64 -mcpu=pwr10 < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -mtriple=powerpc64 -mcpu=pwr7 < %s | FileCheck %s --check-prefix=CHECK-P7-BE
+
+; i16 = bitcast(v16i1 truncate(v16i8))
+define fastcc <16 x i16> @test_v16i8_to_i16(<16 x i1> %0) {
+; CHECK-LE-LABEL: test_v16i8_to_i16:
+; CHECK-LE: # %bb.0: # %Entry
+; CHECK-LE-NEXT: plxv 35, .LCPI0_0@PCREL(0), 1
+; CHECK-LE-NEXT: plxv 1, .LCPI0_1@PCREL(0), 1
+; CHECK-LE-NEXT: li 4, 0
+; CHECK-LE-NEXT: mtfprd 0, 4
+; CHECK-LE-NEXT: vbpermq 2, 2, 3
+; CHECK-LE-NEXT: xxlxor 35, 35, 35
+; CHECK-LE-NEXT: mfvsrwz 3, 34
+; CHECK-LE-NEXT: mtvsrd 34, 3
+; CHECK-LE-NEXT: xxperm 34, 0, 1
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_v16i8_to_i16:
+; CHECK-BE: # %bb.0: # %Entry
+; CHECK-BE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; CHECK-BE-NEXT: lxv 35, 0(3)
+; CHECK-BE-NEXT: vbpermq 2, 2, 3
+; CHECK-BE-NEXT: xxlxor 35, 35, 35
+; CHECK-BE-NEXT: mfvsrwz 3, 34
+; CHECK-BE-NEXT: mtfprwz 0, 3
+; CHECK-BE-NEXT: addis 3, 2, .LCPI0_1@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI0_1@toc@l
+; CHECK-BE-NEXT: lxv 1, 0(3)
+; CHECK-BE-NEXT: li 3, 0
+; CHECK-BE-NEXT: mtvsrwz 34, 3
+; CHECK-BE-NEXT: xxperm 34, 0, 1
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P7-BE-LABEL: test_v16i8_to_i16:
+; CHECK-P7-BE: # %bb.0: # %Entry
+; CHECK-P7-BE-NEXT: addi 3, 1, -32
+; CHECK-P7-BE-NEXT: xxlxor 36, 36, 36
+; CHECK-P7-BE-NEXT: stxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: lbz 3, -17(1)
+; CHECK-P7-BE-NEXT: lbz 4, -18(1)
+; CHECK-P7-BE-NEXT: clrlwi 3, 3, 31
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 1, 30, 30
+; CHECK-P7-BE-NEXT: lbz 4, -19(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 2, 29, 29
+; CHECK-P7-BE-NEXT: lbz 4, -20(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 3, 28, 28
+; CHECK-P7-BE-NEXT: lbz 4, -21(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 4, 27, 27
+; CHECK-P7-BE-NEXT: lbz 4, -22(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 5, 26, 26
+; CHECK-P7-BE-NEXT: lbz 4, -23(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 6, 25, 25
+; CHECK-P7-BE-NEXT: lbz 4, -24(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 7, 24, 24
+; CHECK-P7-BE-NEXT: lbz 4, -25(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 8, 23, 23
+; CHECK-P7-BE-NEXT: lbz 4, -26(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 9, 22, 22
+; CHECK-P7-BE-NEXT: lbz 4, -27(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 10, 21, 21
+; CHECK-P7-BE-NEXT: lbz 4, -28(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 11, 20, 20
+; CHECK-P7-BE-NEXT: lbz 4, -29(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 12, 19, 19
+; CHECK-P7-BE-NEXT: lbz 4, -30(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 13, 18, 18
+; CHECK-P7-BE-NEXT: lbz 4, -31(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 14, 17, 17
+; CHECK-P7-BE-NEXT: lbz 4, -32(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 15, 16, 16
+; CHECK-P7-BE-NEXT: sldi 3, 3, 48
+; CHECK-P7-BE-NEXT: std 3, -48(1)
+; CHECK-P7-BE-NEXT: std 3, -40(1)
+; CHECK-P7-BE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; CHECK-P7-BE-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; CHECK-P7-BE-NEXT: lxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: addi 3, 1, -48
+; CHECK-P7-BE-NEXT: lxvw4x 35, 0, 3
+; CHECK-P7-BE-NEXT: vperm 2, 3, 4, 2
+; CHECK-P7-BE-NEXT: xxlxor 35, 35, 35
+; CHECK-P7-BE-NEXT: blr
+Entry:
+ %1 = bitcast <16 x i1> %0 to i16
+ %2 = insertelement <16 x i16> zeroinitializer, i16 %1, i64 0
+ ret <16 x i16> %2
+}
+
+; i8 = bitcast(v8i1 truncate(v8i16))
+define fastcc <16 x i8> @test_v8i16_to_i8(<8 x i1> %0) {
+; CHECK-LE-LABEL: test_v8i16_to_i8:
+; CHECK-LE: # %bb.0: # %Entry
+; CHECK-LE-NEXT: plxv 35, .LCPI1_0@PCREL(0), 1
+; CHECK-LE-NEXT: plxv 1, .LCPI1_1@PCREL(0), 1
+; CHECK-LE-NEXT: li 4, 0
+; CHECK-LE-NEXT: mtfprd 0, 4
+; CHECK-LE-NEXT: vbpermq 2, 2, 3
+; CHECK-LE-NEXT: mfvsrwz 3, 34
+; CHECK-LE-NEXT: mtvsrd 34, 3
+; CHECK-LE-NEXT: xxperm 34, 0, 1
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_v8i16_to_i8:
+; CHECK-BE: # %bb.0: # %Entry
+; CHECK-BE-NEXT: addis 3, 2, .LCPI1_0@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI1_0@toc@l
+; CHECK-BE-NEXT: lxv 35, 0(3)
+; CHECK-BE-NEXT: vbpermq 2, 2, 3
+; CHECK-BE-NEXT: mfvsrwz 3, 34
+; CHECK-BE-NEXT: mtfprwz 0, 3
+; CHECK-BE-NEXT: addis 3, 2, .LCPI1_1@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI1_1@toc@l
+; CHECK-BE-NEXT: lxv 1, 0(3)
+; CHECK-BE-NEXT: li 3, 0
+; CHECK-BE-NEXT: mtvsrwz 34, 3
+; CHECK-BE-NEXT: xxperm 34, 0, 1
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P7-BE-LABEL: test_v8i16_to_i8:
+; CHECK-P7-BE: # %bb.0: # %Entry
+; CHECK-P7-BE-NEXT: addi 3, 1, -32
+; CHECK-P7-BE-NEXT: xxlxor 36, 36, 36
+; CHECK-P7-BE-NEXT: stxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: lhz 3, -18(1)
+; CHECK-P7-BE-NEXT: lhz 4, -20(1)
+; CHECK-P7-BE-NEXT: clrlwi 3, 3, 31
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 1, 30, 30
+; CHECK-P7-BE-NEXT: lhz 4, -22(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 2, 29, 29
+; CHECK-P7-BE-NEXT: lhz 4, -24(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 3, 28, 28
+; CHECK-P7-BE-NEXT: lhz 4, -26(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 4, 27, 27
+; CHECK-P7-BE-NEXT: lhz 4, -28(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 5, 26, 26
+; CHECK-P7-BE-NEXT: lhz 4, -30(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 6, 25, 25
+; CHECK-P7-BE-NEXT: lhz 4, -32(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 7, 24, 24
+; CHECK-P7-BE-NEXT: sldi 3, 3, 56
+; CHECK-P7-BE-NEXT: std 3, -48(1)
+; CHECK-P7-BE-NEXT: std 3, -40(1)
+; CHECK-P7-BE-NEXT: addis 3, 2, .LCPI1_0@toc@ha
+; CHECK-P7-BE-NEXT: addi 3, 3, .LCPI1_0@toc@l
+; CHECK-P7-BE-NEXT: lxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: addi 3, 1, -48
+; CHECK-P7-BE-NEXT: lxvw4x 35, 0, 3
+; CHECK-P7-BE-NEXT: vperm 2, 3, 4, 2
+; CHECK-P7-BE-NEXT: blr
+Entry:
+ %2 = bitcast <8 x i1> %0 to i8
+ %3 = insertelement <16 x i8> zeroinitializer, i8 %2, i64 0
+ ret <16 x i8> %3
+}
+
+; i8 = bitcast(v8i1 truncate(v8i8))
+define i8 @test_v8i8_to_i8(<8 x i8> %a) {
+; CHECK-LE-LABEL: test_v8i8_to_i8:
+; CHECK-LE: # %bb.0:
+; CHECK-LE-NEXT: plxv 35, .LCPI2_0@PCREL(0), 1
+; CHECK-LE-NEXT: vbpermq 2, 2, 3
+; CHECK-LE-NEXT: mfvsrwz 3, 34
+; CHECK-LE-NEXT: blr
+;
+; CHECK-BE-LABEL: test_v8i8_to_i8:
+; CHECK-BE: # %bb.0:
+; CHECK-BE-NEXT: addis 3, 2, .LCPI2_0@toc@ha
+; CHECK-BE-NEXT: addi 3, 3, .LCPI2_0@toc@l
+; CHECK-BE-NEXT: lxv 35, 0(3)
+; CHECK-BE-NEXT: vbpermq 2, 2, 3
+; CHECK-BE-NEXT: mfvsrwz 3, 34
+; CHECK-BE-NEXT: blr
+;
+; CHECK-P7-BE-LABEL: test_v8i8_to_i8:
+; CHECK-P7-BE: # %bb.0:
+; CHECK-P7-BE-NEXT: addi 3, 1, -32
+; CHECK-P7-BE-NEXT: stxvw4x 34, 0, 3
+; CHECK-P7-BE-NEXT: lbz 3, -25(1)
+; CHECK-P7-BE-NEXT: lbz 4, -26(1)
+; CHECK-P7-BE-NEXT: clrlwi 3, 3, 31
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 1, 30, 30
+; CHECK-P7-BE-NEXT: lbz 4, -27(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 2, 29, 29
+; CHECK-P7-BE-NEXT: lbz 4, -28(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 3, 28, 28
+; CHECK-P7-BE-NEXT: lbz 4, -29(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 4, 27, 27
+; CHECK-P7-BE-NEXT: lbz 4, -30(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 5, 26, 26
+; CHECK-P7-BE-NEXT: lbz 4, -31(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 6, 25, 25
+; CHECK-P7-BE-NEXT: lbz 4, -32(1)
+; CHECK-P7-BE-NEXT: rlwimi 3, 4, 7, 24, 24
+; CHECK-P7-BE-NEXT: stb 3, -1(1)
+; CHECK-P7-BE-NEXT: lbz 3, -1(1)
+; CHECK-P7-BE-NEXT: blr
+ %1 = trunc <8 x i8> %a to <8 x i1>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/190802
More information about the llvm-branch-commits
mailing list