[llvm] [LoongArch] Add demanded bits support for [X]VMSKLTZ (PR #143528)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 10 06:06:54 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-loongarch
Author: hev (heiher)
This patch adds a DAG combine hook for the [X]VMSKLTZ nodes to simplify their inputs when possible. It also implements target-specific logic in SimplifyDemandedBitsForTargetNode to eliminate unnecessary computations when only a subset of the sign bits in the vector result is actually used.
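For intuition, here is a minimal scalar model of the [X]VMSKLTZ semantics (an illustrative sketch, not LLVM code): result bit `i` is the sign bit of source element `i`, and every result bit at or above the element count is zero.

```cpp
// Scalar model of [X]VMSKLTZ (illustrative only): result bit I is the
// MSB of element I; result bits at and above NumElts are always zero.
#include <cstddef>
#include <cstdint>
#include <cstdio>

template <typename ElemT, std::size_t NumElts>
uint64_t vmskltz(const ElemT (&Vec)[NumElts]) {
  uint64_t Mask = 0;
  for (std::size_t I = 0; I < NumElts; ++I)
    if (Vec[I] < 0) // only the sign (MSB) of each element is observed
      Mask |= uint64_t(1) << I;
  return Mask;
}

int main() {
  int32_t V[8] = {-1, 2, -3, 4, -5, 6, -7, 8};
  // Elements 0, 2, 4, 6 are negative -> mask 0b01010101 = 0x55.
  printf("0x%llx\n", (unsigned long long)vmskltz(V));
}
```

This is why the hook below can fold the node to zero when none of the low `NumElts` result bits are demanded, and why each demanded result bit maps to exactly one demanded source element whose only demanded bit is its MSB.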
---
Full diff: https://github.com/llvm/llvm-project/pull/143528.diff
4 Files Affected:
- (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+71)
- (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.h (+6)
- (modified) llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll (+3-12)
- (modified) llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll (-16)
``````````diff
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index b869ad25e7852..bffbeeb4b9be3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -5634,6 +5634,21 @@ static SDValue performMOVFR2GR_SCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue performVMSKLTZCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const LoongArchSubtarget &Subtarget) {
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumBits = VT.getScalarSizeInBits();
+
+ // Simplify the inputs.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnes(NumBits));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -5658,6 +5673,9 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
return performMOVGR2FR_WCombine(N, DAG, DCI, Subtarget);
case LoongArchISD::MOVFR2GR_S_LA64:
return performMOVFR2GR_SCombine(N, DAG, DCI, Subtarget);
+ case LoongArchISD::VMSKLTZ:
+ case LoongArchISD::XVMSKLTZ:
+ return performVMSKLTZCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
}
@@ -8192,3 +8210,56 @@ unsigned LoongArchTargetLowering::getNumRegistersForCallingConv(
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
+
+bool LoongArchTargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = OriginalDemandedBits.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case LoongArchISD::VMSKLTZ:
+ case LoongArchISD::XVMSKLTZ: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned SrcBits = SrcVT.getScalarSizeInBits();
+ unsigned NumElts = SrcVT.getVectorNumElements();
+
+ // If we don't need the sign bits at all, just return zero.
+ if (OriginalDemandedBits.countr_zero() >= NumElts)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ // Only demand the vector elements of the sign bits we need.
+ APInt KnownUndef, KnownZero;
+ APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
+ if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+ TLO, Depth + 1))
+ return true;
+
+ Known.Zero = KnownZero.zext(BitWidth);
+ Known.Zero.setHighBits(BitWidth - NumElts);
+
+ // [X]VMSKLTZ only uses the MSB from each vector element.
+ KnownBits KnownSrc;
+ APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
+ Depth + 1))
+ return true;
+
+ if (KnownSrc.One[SrcBits - 1])
+ Known.One.setLowBits(NumElts);
+ else if (KnownSrc.Zero[SrcBits - 1])
+ Known.Zero.setLowBits(NumElts);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
+ return false;
+ }
+ }
+
+ return TargetLowering::SimplifyDemandedBitsForTargetNode(
+ Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 53e3f1adb8d27..79aa89726191b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -314,6 +314,12 @@ class LoongArchTargetLowering : public TargetLowering {
bool isFPImmVLDILegal(const APFloat &Imm, EVT VT) const;
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+ bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ KnownBits &Known,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const override;
+
private:
/// Target-specific function used to lower LoongArch calling conventions.
typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI,
diff --git a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
index 7e015852e0abc..5a861be95977d 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
@@ -383,9 +383,8 @@ define i8 @xvmsk_eq_vsel_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
; CHECK-LABEL: xvmsk_eq_vsel_slt_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvseq.w $xr0, $xr0, $xr1
-; CHECK-NEXT: xvslti.w $xr1, $xr2, 0
-; CHECK-NEXT: xvrepli.b $xr2, -1
-; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvbitsel.v $xr0, $xr2, $xr1, $xr0
; CHECK-NEXT: xvmskltz.w $xr0, $xr0
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4
@@ -408,8 +407,7 @@ define i8 @xvmsk_sel_eq_or_eq_or_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
; CHECK-NEXT: xvreplgr2vr.w $xr4, $a0
; CHECK-NEXT: xvand.v $xr2, $xr2, $xr4
; CHECK-NEXT: xvseq.w $xr0, $xr0, $xr1
-; CHECK-NEXT: xvslti.w $xr1, $xr3, 0
-; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT: xvor.v $xr0, $xr3, $xr0
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2
; CHECK-NEXT: xvmskltz.w $xr0, $xr0
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
@@ -530,7 +528,6 @@ define i8 @xvmsk_eq_v2i64_concat_poison(<2 x i64> %vec) {
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vpackev.h $vr0, $vr0, $vr1
; CHECK-NEXT: vslli.h $vr0, $vr0, 15
-; CHECK-NEXT: vsrai.h $vr0, $vr0, 15
; CHECK-NEXT: vmskltz.h $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -558,7 +555,6 @@ define i8 @xvmsk_ne_v4i32_concat_poison(<4 x i32> %vec) {
; CHECK-NEXT: st.h $a0, $sp, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vslli.h $vr0, $vr0, 15
-; CHECK-NEXT: vsrai.h $vr0, $vr0, 15
; CHECK-NEXT: vmskltz.h $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -586,7 +582,6 @@ define i8 @xvmsk_ogt_v4f64_concat_poison(<4 x double> %vec) {
; CHECK-NEXT: st.h $a0, $sp, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vslli.h $vr0, $vr0, 15
-; CHECK-NEXT: vsrai.h $vr0, $vr0, 15
; CHECK-NEXT: vmskltz.h $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -601,7 +596,6 @@ define i32 @xvmsk_trunc_i8(<32 x i8> %a) {
; CHECK-LABEL: xvmsk_trunc_i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvslli.b $xr0, $xr0, 7
-; CHECK-NEXT: xvsrai.b $xr0, $xr0, 7
; CHECK-NEXT: xvmskltz.b $xr0, $xr0
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4
@@ -616,7 +610,6 @@ define i16 @xvmsk_trunc_i16(<16 x i16> %a) {
; CHECK-LABEL: xvmsk_trunc_i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvslli.h $xr0, $xr0, 15
-; CHECK-NEXT: xvsrai.h $xr0, $xr0, 15
; CHECK-NEXT: xvmskltz.h $xr0, $xr0
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4
@@ -631,7 +624,6 @@ define i8 @xvmsk_trunc_i32(<8 x i32> %a) {
; CHECK-LABEL: xvmsk_trunc_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvslli.w $xr0, $xr0, 31
-; CHECK-NEXT: xvsrai.w $xr0, $xr0, 31
; CHECK-NEXT: xvmskltz.w $xr0, $xr0
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4
@@ -646,7 +638,6 @@ define i4 @xvmsk_trunc_i64(<4 x i64> %a) {
; CHECK-LABEL: xvmsk_trunc_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvslli.d $xr0, $xr0, 63
-; CHECK-NEXT: xvsrai.d $xr0, $xr0, 63
; CHECK-NEXT: xvmskltz.d $xr0, $xr0
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
index d8098ccc9328d..0ee30120f77a6 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
@@ -181,7 +181,6 @@ define i2 @vmsk_sgt_v2i8(<2 x i8> %a, <2 x i8> %b) {
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 56
-; CHECK-NEXT: vsrai.d $vr0, $vr0, 56
; CHECK-NEXT: vmskltz.d $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -197,7 +196,6 @@ define i2 @vmsk_sgt_v2i16(<2 x i16> %a, <2 x i16> %b) {
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 48
-; CHECK-NEXT: vsrai.d $vr0, $vr0, 48
; CHECK-NEXT: vmskltz.d $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -212,7 +210,6 @@ define i2 @vmsk_sgt_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-NEXT: vslt.w $vr0, $vr1, $vr0
; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16
; CHECK-NEXT: vslli.d $vr0, $vr0, 32
-; CHECK-NEXT: vsrai.d $vr0, $vr0, 32
; CHECK-NEXT: vmskltz.d $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -252,7 +249,6 @@ define i4 @vmsk_sgt_v4i8(<4 x i8> %a, <4 x i8> %b) {
; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 24
-; CHECK-NEXT: vsrai.w $vr0, $vr0, 24
; CHECK-NEXT: vmskltz.w $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -267,7 +263,6 @@ define i4 @vmsk_sgt_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-NEXT: vslt.h $vr0, $vr1, $vr0
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 16
-; CHECK-NEXT: vsrai.w $vr0, $vr0, 16
; CHECK-NEXT: vmskltz.w $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -306,7 +301,6 @@ define i8 @vmsk_sgt_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-NEXT: vslt.b $vr0, $vr1, $vr0
; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.h $vr0, $vr0, 8
-; CHECK-NEXT: vsrai.h $vr0, $vr0, 8
; CHECK-NEXT: vmskltz.h $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -349,7 +343,6 @@ define i2 @vmsk_sgt_and_sgt_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8>
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 56
-; CHECK-NEXT: vsrai.d $vr0, $vr0, 56
; CHECK-NEXT: vmskltz.d $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -369,7 +362,6 @@ define i2 @vmsk_sgt_and_sgt_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 48
-; CHECK-NEXT: vsrai.d $vr0, $vr0, 48
; CHECK-NEXT: vmskltz.d $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -388,7 +380,6 @@ define i2 @vmsk_sgt_and_sgt_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16
; CHECK-NEXT: vslli.d $vr0, $vr0, 32
-; CHECK-NEXT: vsrai.d $vr0, $vr0, 32
; CHECK-NEXT: vmskltz.d $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -440,7 +431,6 @@ define i4 @vmsk_sgt_and_sgt_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 24
-; CHECK-NEXT: vsrai.w $vr0, $vr0, 24
; CHECK-NEXT: vmskltz.w $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -459,7 +449,6 @@ define i4 @vmsk_sgt_and_sgt_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 16
-; CHECK-NEXT: vsrai.w $vr0, $vr0, 16
; CHECK-NEXT: vmskltz.w $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -510,7 +499,6 @@ define i8 @vmsk_sgt_and_sgt_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8>
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.h $vr0, $vr0, 8
-; CHECK-NEXT: vsrai.h $vr0, $vr0, 8
; CHECK-NEXT: vmskltz.h $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -557,7 +545,6 @@ define i16 @vmsk_trunc_i8(<16 x i8> %a) {
; CHECK-LABEL: vmsk_trunc_i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vslli.b $vr0, $vr0, 7
-; CHECK-NEXT: vsrai.b $vr0, $vr0, 7
; CHECK-NEXT: vmskltz.b $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -570,7 +557,6 @@ define i8 @vmsk_trunc_i16(<8 x i16> %a) {
; CHECK-LABEL: vmsk_trunc_i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vslli.h $vr0, $vr0, 15
-; CHECK-NEXT: vsrai.h $vr0, $vr0, 15
; CHECK-NEXT: vmskltz.h $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -583,7 +569,6 @@ define i4 @vmsk_trunc_i32(<4 x i32> %a) {
; CHECK-LABEL: vmsk_trunc_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vslli.w $vr0, $vr0, 31
-; CHECK-NEXT: vsrai.w $vr0, $vr0, 31
; CHECK-NEXT: vmskltz.w $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -596,7 +581,6 @@ define i2 @vmsk_trunc_i64(<2 x i64> %a) {
; CHECK-LABEL: vmsk_trunc_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vslli.d $vr0, $vr0, 63
-; CHECK-NEXT: vsrai.d $vr0, $vr0, 63
; CHECK-NEXT: vmskltz.d $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
``````````
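Regarding the test changes: each removed `vsrai` only re-materialized full 0/-1 lanes after a `vslli`, and an arithmetic shift right leaves the sign bit unchanged, so [X]VMSKLTZ (which reads only each lane's MSB) produces the same mask without it. A quick exhaustive check of that equivalence for one i16 lane (a sketch assuming arithmetic right shift on signed host integers):

```cpp
// Check: for vslli.h 15 followed by vsrai.h 15, the lane's sign bit --
// the only bit VMSKLTZ observes -- is the same with or without the vsrai.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
    int16_t Shl = int16_t(X << 15);    // vslli.h $vr0, $vr0, 15
    int16_t AShr = int16_t(Shl >> 15); // vsrai.h $vr0, $vr0, 15 (now dead)
    assert((Shl < 0) == (AShr < 0));   // same MSB -> same VMSKLTZ mask bit
  }
  return 0;
}
```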
https://github.com/llvm/llvm-project/pull/143528