[llvm-branch-commits] [llvm] [LoongArch] Try to avoid casts around logical vector ops on lasx (PR #163523)
Zhaoxin Yang via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Oct 15 01:35:02 PDT 2025
https://github.com/ylzsx created https://github.com/llvm/llvm-project/pull/163523
On LASX, the types v4i1/v8i1/v16i1 may be legalized to v4i32/v8i16/v16i8, which are LSX-sized (128-bit) vector types. In most cases we actually compare or select LASX-sized registers, and mixing the two sizes creates horrible code.
From 08c46e407e220b54aeaa2711eea4374ddb63bf32 Mon Sep 17 00:00:00 2001
From: yangzhaoxin <yangzhaoxin at loongson.cn>
Date: Wed, 15 Oct 2025 15:13:03 +0800
Subject: [PATCH] [LoongArch] Try to avoid casts around logical vector ops on
lasx
On LASX, the types v4i1/v8i1/v16i1 may be legalized to v4i32/v8i16/v16i8,
which are LSX-sized (128-bit) vector types. In most cases we actually compare
or select LASX-sized registers, and mixing the two sizes creates horrible code.
---
.../LoongArch/LoongArchISelLowering.cpp | 126 +++-
.../test/CodeGen/LoongArch/lasx/vxi1-masks.ll | 666 +++---------------
2 files changed, 208 insertions(+), 584 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index f7deeafc9ccfc..509ae3f0c5e1a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -466,8 +466,12 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
// Set DAG combine for 'LASX' feature.
- if (Subtarget.hasExtLASX())
+ if (Subtarget.hasExtLASX()) {
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ }
// Compute derived properties from the register classes.
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -6679,6 +6683,122 @@ performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Widen a tree of AND/OR/XOR nodes rooted at \p N to the type \p VT so that
+// the casts surrounding the logical operations disappear. For example,
+//   or (and (truncate x), (truncate y)),
+//      (xor (truncate z), (build_vector (constants)))
+// becomes, for x, y and z of type \p VT,
+//   or (and x, y), (xor z, zext(build_vector (constants)))
+// The left operand of every node must be a truncate from \p VT or a logic op
+// that can itself be widened. The right operand may additionally be a node
+// accepted by ISD::isExtVecInRegOpcode (single use, 256-bit \p VT, LASX
+// available) or a value that FoldConstantArithmetic can zero-extend.
+// Returns an empty SDValue when the tree cannot be widened, including when
+// \p Depth has reached SelectionDAG::MaxRecursionDepth.
+static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
+                                     SelectionDAG &DAG,
+                                     const LoongArchSubtarget &Subtarget,
+                                     unsigned Depth) {
+  // Cap the recursion to keep compile time bounded, and only look through
+  // bitwise logic nodes.
+  if (Depth >= SelectionDAG::MaxRecursionDepth ||
+      !ISD::isBitwiseLogicOp(N.getOpcode()))
+    return SDValue();
+
+  // The widened operation itself must be available on VT.
+  const unsigned LogicOpc = N.getOpcode();
+  if (!DAG.getTargetLoweringInfo().isOperationLegalOrPromote(LogicOpc, VT))
+    return SDValue();
+
+  // True when V is a truncate whose source already has the wide type VT, so
+  // the truncate can simply be peeled off.
+  auto IsTruncFromVT = [VT](SDValue V) {
+    return V.getOpcode() == ISD::TRUNCATE &&
+           V.getOperand(0).getValueType() == VT;
+  };
+
+  // Left operand: a widenable logic op or a truncate from VT.
+  SDValue LHS = N.getOperand(0);
+  if (SDValue WideLHS =
+          PromoteMaskArithmetic(LHS, DL, VT, DAG, Subtarget, Depth + 1))
+    LHS = WideLHS;
+  else if (IsTruncFromVT(LHS))
+    LHS = LHS.getOperand(0);
+  else
+    return SDValue();
+
+  // Right operand: a widenable logic op, a truncate from VT, an existing
+  // extension that can be re-issued at VT, or a (foldable) constant.
+  SDValue RHS = N.getOperand(1);
+  if (SDValue WideRHS =
+          PromoteMaskArithmetic(RHS, DL, VT, DAG, Subtarget, Depth + 1))
+    RHS = WideRHS;
+  else if (IsTruncFromVT(RHS))
+    RHS = RHS.getOperand(0);
+  else if (ISD::isExtVecInRegOpcode(RHS.getOpcode()) && VT.is256BitVector() &&
+           Subtarget.hasExtLASX() && RHS.hasOneUse())
+    RHS = DAG.getNode(RHS.getOpcode(), DL, VT, RHS.getOperand(0));
+  // On 32-bit platform, i64 is an illegal integer scalar type, and
+  // FoldConstantArithmetic will fail for v4i64. This may be optimized in the
+  // future.
+  else if (SDValue Cst =
+               DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {RHS}))
+    RHS = Cst;
+  else
+    return SDValue();
+
+  // Both operands now have type VT; rebuild the logic op on the wide type.
+  return DAG.getNode(LogicOpc, DL, VT, LHS, RHS);
+}
+
+// Entry point used by the extension combines. \p N must be a vector
+// ANY_EXTEND, ZERO_EXTEND or SIGN_EXTEND node.
+// On LASX the types v4i1/v8i1/v16i1 may be legalized to v4i32/v8i16/v16i8,
+// which are LSX-sized vector types. In most cases we actually compare or
+// select LASX-sized registers, and mixing the two sizes creates horrible code.
+// This method optimizes some of the transition sequences by performing the
+// logic operation feeding the extension directly on the wide type.
+// Returns an empty SDValue when nothing was changed.
+static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
+                                     SelectionDAG &DAG,
+                                     const LoongArchSubtarget &Subtarget) {
+  const unsigned ExtOpc = N.getOpcode();
+  EVT VT = N.getValueType();
+  assert(VT.isVector() && "Expected vector type");
+  assert((ExtOpc == ISD::ANY_EXTEND || ExtOpc == ISD::ZERO_EXTEND ||
+          ExtOpc == ISD::SIGN_EXTEND) &&
+         "Invalid Node");
+
+  // Only 256-bit results with LASX available are handled.
+  if (!(Subtarget.hasExtLASX() && VT.is256BitVector()))
+    return SDValue();
+
+  // Try to rebuild the narrow operand's logic tree on the wide type.
+  SDValue Narrow = N.getOperand(0);
+  SDValue Wide =
+      PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, /*Depth=*/0);
+  if (!Wide)
+    return SDValue();
+
+  // Re-apply the semantics of the original extension to the wide value: the
+  // bits above the narrow element width are left as-is for ANY_EXTEND,
+  // zeroed for ZERO_EXTEND and sign-filled for SIGN_EXTEND.
+  EVT NarrowVT = Narrow.getValueType();
+  if (ExtOpc == ISD::ANY_EXTEND)
+    return Wide;
+  if (ExtOpc == ISD::ZERO_EXTEND)
+    return DAG.getZeroExtendInReg(Wide, DL, NarrowVT);
+  if (ExtOpc == ISD::SIGN_EXTEND)
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Wide,
+                       DAG.getValueType(NarrowVT));
+  llvm_unreachable("Unexpected opcode");
+}
+
+// DAG combine hook shared by ANY_EXTEND, ZERO_EXTEND and SIGN_EXTEND nodes.
+// Vector extensions are handed to PromoteMaskArithmetic; scalar extensions
+// are left untouched. An empty SDValue means no change was made.
+// \p DCI is accepted for signature parity with the other combines but is not
+// used here.
+static SDValue performANY_EXTENDCombine(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const LoongArchSubtarget &Subtarget) {
+  // Nothing to do for scalar results.
+  if (!N->getValueType(0).isVector())
+    return SDValue();
+
+  SDLoc DL(N);
+  // An empty result from the helper is propagated unchanged.
+  return PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget);
+}
+
SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -6695,6 +6815,10 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
return performSRLCombine(N, DAG, DCI, Subtarget);
case ISD::BITCAST:
return performBITCASTCombine(N, DAG, DCI, Subtarget);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ return performANY_EXTENDCombine(N, DAG, DCI, Subtarget);
case LoongArchISD::BITREV_W:
return performBITREV_WCombine(N, DAG, DCI, Subtarget);
case LoongArchISD::BR_CC:
diff --git a/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll b/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll
index cd98ba7e4083c..59757c27bd020 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll
@@ -31,28 +31,12 @@ define void @xor_zext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
; LA64: # %bb.0:
; LA64-NEXT: xvld $xr0, $a1, 0
; LA64-NEXT: xvld $xr1, $a2, 0
+; LA64-NEXT: pcalau12i $a1, %pc_hi20(.LCPI0_0)
+; LA64-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI0_0)
; LA64-NEXT: xvfcmp.clt.d $xr0, $xr0, $xr1
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 0
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 1
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 3
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT: vldi $vr0, -1777
-; LA64-NEXT: vxor.v $vr0, $vr1, $vr0
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 2
-; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 0
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 3
-; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 0
-; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 0
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 1
-; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 1
-; LA64-NEXT: xvpermi.q $xr2, $xr1, 2
-; LA64-NEXT: xvrepli.d $xr0, 1
-; LA64-NEXT: xvand.v $xr0, $xr2, $xr0
+; LA64-NEXT: xvxor.v $xr0, $xr0, $xr2
+; LA64-NEXT: xvrepli.d $xr1, 1
+; LA64-NEXT: xvand.v $xr0, $xr0, $xr1
; LA64-NEXT: xvst $xr0, $a0, 0
; LA64-NEXT: ret
%v0 = load <4 x double>, ptr %a
@@ -70,43 +54,10 @@ define void @xor_zext_masks_v8i32(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 0
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 2
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 3
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 4
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 5
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 6
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 7
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT: vldi $vr0, -2305
-; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 3
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 3
-; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
-; CHECK-NEXT: xvrepli.w $xr0, 1
-; CHECK-NEXT: xvand.v $xr0, $xr2, $xr0
+; CHECK-NEXT: xvldi $xr1, -1789
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvrepli.w $xr1, 1
+; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <8 x float>, ptr %a
@@ -124,76 +75,10 @@ define void @xor_zext_masks_v16i16(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvseq.h $xr0, $xr0, $xr1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 3
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 5
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 7
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 9
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 11
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 13
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 15
-; CHECK-NEXT: vrepli.h $vr0, 255
-; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 8
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 9
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 10
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 11
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 12
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 13
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 14
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 15
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 0
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 1
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 2
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 3
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 4
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 5
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 6
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 7
-; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
-; CHECK-NEXT: xvrepli.h $xr0, 1
-; CHECK-NEXT: xvand.v $xr0, $xr2, $xr0
+; CHECK-NEXT: xvrepli.w $xr1, 255
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvrepli.h $xr1, 1
+; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <16 x i16>, ptr %a
@@ -238,22 +123,12 @@ define void @xor_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
; LA64: # %bb.0:
; LA64-NEXT: xvld $xr0, $a1, 0
; LA64-NEXT: xvld $xr1, $a2, 0
+; LA64-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0)
+; LA64-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI3_0)
; LA64-NEXT: xvfcmp.clt.d $xr0, $xr0, $xr1
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 0
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 1
-; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 3
-; LA64-NEXT: vrepli.b $vr0, -1
-; LA64-NEXT: vxor.v $vr0, $vr1, $vr0
-; LA64-NEXT: vpickve2gr.w $a3, $vr0, 2
-; LA64-NEXT: vinsgr2vr.d $vr1, $a3, 0
-; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 1
-; LA64-NEXT: vpickve2gr.w $a2, $vr0, 0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a2, 0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 1
-; LA64-NEXT: xvpermi.q $xr0, $xr1, 2
+; LA64-NEXT: xvxor.v $xr0, $xr0, $xr2
+; LA64-NEXT: xvslli.d $xr0, $xr0, 32
+; LA64-NEXT: xvsrai.d $xr0, $xr0, 32
; LA64-NEXT: xvst $xr0, $a0, 0
; LA64-NEXT: ret
%v0 = load <4 x double>, ptr %a
@@ -266,87 +141,17 @@ define void @xor_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
}
define void @xor_sext_masks_v8i32(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: xor_sext_masks_v8i32:
-; LA32: # %bb.0:
-; LA32-NEXT: xvld $xr0, $a1, 0
-; LA32-NEXT: xvld $xr1, $a2, 0
-; LA32-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 0
-; LA32-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 2
-; LA32-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 4
-; LA32-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 6
-; LA32-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3
-; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 1
-; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 7
-; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 5
-; LA32-NEXT: vrepli.b $vr0, -1
-; LA32-NEXT: vxor.v $vr0, $vr1, $vr0
-; LA32-NEXT: vpickve2gr.h $a5, $vr0, 4
-; LA32-NEXT: ext.w.h $a5, $a5
-; LA32-NEXT: vinsgr2vr.w $vr1, $a5, 0
-; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 1
-; LA32-NEXT: vpickve2gr.h $a4, $vr0, 6
-; LA32-NEXT: ext.w.h $a4, $a4
-; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 2
-; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 3
-; LA32-NEXT: vpickve2gr.h $a3, $vr0, 0
-; LA32-NEXT: ext.w.h $a3, $a3
-; LA32-NEXT: vinsgr2vr.w $vr2, $a3, 0
-; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 1
-; LA32-NEXT: vpickve2gr.h $a2, $vr0, 2
-; LA32-NEXT: ext.w.h $a2, $a2
-; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 2
-; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT: xvpermi.q $xr2, $xr1, 2
-; LA32-NEXT: xvst $xr2, $a0, 0
-; LA32-NEXT: ret
-;
-; LA64-LABEL: xor_sext_masks_v8i32:
-; LA64: # %bb.0:
-; LA64-NEXT: xvld $xr0, $a1, 0
-; LA64-NEXT: xvld $xr1, $a2, 0
-; LA64-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 0
-; LA64-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 2
-; LA64-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 4
-; LA64-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 6
-; LA64-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 3
-; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 1
-; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 7
-; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 5
-; LA64-NEXT: vrepli.b $vr0, -1
-; LA64-NEXT: vxor.v $vr0, $vr1, $vr0
-; LA64-NEXT: vpickve2gr.h $a5, $vr0, 4
-; LA64-NEXT: ext.w.h $a5, $a5
-; LA64-NEXT: vinsgr2vr.w $vr1, $a5, 0
-; LA64-NEXT: ext.w.h $a4, $a4
-; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 1
-; LA64-NEXT: vpickve2gr.h $a4, $vr0, 6
-; LA64-NEXT: ext.w.h $a4, $a4
-; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 2
-; LA64-NEXT: ext.w.h $a3, $a3
-; LA64-NEXT: vinsgr2vr.w $vr1, $a3, 3
-; LA64-NEXT: vpickve2gr.h $a3, $vr0, 0
-; LA64-NEXT: ext.w.h $a3, $a3
-; LA64-NEXT: vinsgr2vr.w $vr2, $a3, 0
-; LA64-NEXT: ext.w.h $a2, $a2
-; LA64-NEXT: vinsgr2vr.w $vr2, $a2, 1
-; LA64-NEXT: vpickve2gr.h $a2, $vr0, 2
-; LA64-NEXT: ext.w.h $a2, $a2
-; LA64-NEXT: vinsgr2vr.w $vr2, $a2, 2
-; LA64-NEXT: ext.w.h $a1, $a1
-; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3
-; LA64-NEXT: xvpermi.q $xr2, $xr1, 2
-; LA64-NEXT: xvst $xr2, $a0, 0
-; LA64-NEXT: ret
+; CHECK-LABEL: xor_sext_masks_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
+; CHECK-NEXT: xvldi $xr1, -1789
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvslli.w $xr0, $xr0, 16
+; CHECK-NEXT: xvsrai.w $xr0, $xr0, 16
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
%v0 = load <8 x float>, ptr %a
%v1 = load <8 x float>, ptr %b
%m0 = fcmp olt <8 x float> %v0, %v1
@@ -362,74 +167,11 @@ define void @xor_sext_masks_v16i16(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvseq.h $xr0, $xr0, $xr1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6
-; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8
-; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10
-; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12
-; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vpickve2gr.h $a2, $vr0, 5
-; CHECK-NEXT: vpickve2gr.h $a3, $vr0, 3
-; CHECK-NEXT: vpickve2gr.h $a4, $vr0, 1
-; CHECK-NEXT: vpickve2gr.h $a5, $vr2, 7
-; CHECK-NEXT: vpickve2gr.h $a6, $vr2, 5
-; CHECK-NEXT: vpickve2gr.h $a7, $vr2, 3
-; CHECK-NEXT: vpickve2gr.h $t0, $vr2, 1
-; CHECK-NEXT: vxori.b $vr0, $vr1, 255
-; CHECK-NEXT: vpickve2gr.b $t1, $vr0, 8
-; CHECK-NEXT: ext.w.b $t1, $t1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $t1, 0
-; CHECK-NEXT: ext.w.b $t0, $t0
-; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 1
-; CHECK-NEXT: vpickve2gr.b $t0, $vr0, 10
-; CHECK-NEXT: ext.w.b $t0, $t0
-; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 2
-; CHECK-NEXT: ext.w.b $a7, $a7
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a7, 3
-; CHECK-NEXT: vpickve2gr.b $a7, $vr0, 12
-; CHECK-NEXT: ext.w.b $a7, $a7
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a7, 4
-; CHECK-NEXT: ext.w.b $a6, $a6
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a6, 5
-; CHECK-NEXT: vpickve2gr.b $a6, $vr0, 14
-; CHECK-NEXT: ext.w.b $a6, $a6
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a6, 6
-; CHECK-NEXT: ext.w.b $a5, $a5
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a5, 7
-; CHECK-NEXT: vpickve2gr.b $a5, $vr0, 0
-; CHECK-NEXT: ext.w.b $a5, $a5
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a5, 0
-; CHECK-NEXT: ext.w.b $a4, $a4
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a4, 1
-; CHECK-NEXT: vpickve2gr.b $a4, $vr0, 2
-; CHECK-NEXT: ext.w.b $a4, $a4
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a4, 2
-; CHECK-NEXT: ext.w.b $a3, $a3
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a3, 3
-; CHECK-NEXT: vpickve2gr.b $a3, $vr0, 4
-; CHECK-NEXT: ext.w.b $a3, $a3
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a3, 4
-; CHECK-NEXT: ext.w.b $a2, $a2
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a2, 5
-; CHECK-NEXT: vpickve2gr.b $a2, $vr0, 6
-; CHECK-NEXT: ext.w.b $a2, $a2
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a2, 6
-; CHECK-NEXT: ext.w.b $a1, $a1
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 7
-; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
-; CHECK-NEXT: xvst $xr2, $a0, 0
+; CHECK-NEXT: xvrepli.w $xr1, 255
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvslli.h $xr0, $xr0, 8
+; CHECK-NEXT: xvsrai.h $xr0, $xr0, 8
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <16 x i16>, ptr %a
%v1 = load <16 x i16>, ptr %b
@@ -470,28 +212,12 @@ define void @or_zext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
; LA64: # %bb.0:
; LA64-NEXT: xvld $xr0, $a1, 0
; LA64-NEXT: xvld $xr1, $a2, 0
+; LA64-NEXT: pcalau12i $a1, %pc_hi20(.LCPI6_0)
+; LA64-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI6_0)
; LA64-NEXT: xvfcmp.clt.d $xr0, $xr0, $xr1
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 0
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 1
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 3
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT: vldi $vr0, -1777
-; LA64-NEXT: vor.v $vr0, $vr1, $vr0
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 2
-; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 0
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 3
-; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 0
-; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 0
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 1
-; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 1
-; LA64-NEXT: xvpermi.q $xr2, $xr1, 2
-; LA64-NEXT: xvrepli.d $xr0, 1
-; LA64-NEXT: xvand.v $xr0, $xr2, $xr0
+; LA64-NEXT: xvor.v $xr0, $xr0, $xr2
+; LA64-NEXT: xvrepli.d $xr1, 1
+; LA64-NEXT: xvand.v $xr0, $xr0, $xr1
; LA64-NEXT: xvst $xr0, $a0, 0
; LA64-NEXT: ret
%v0 = load <4 x double>, ptr %a
@@ -509,43 +235,10 @@ define void @or_zext_masks_v8i32(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 0
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 2
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 3
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 4
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 5
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 6
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 7
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT: vldi $vr0, -2305
-; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 3
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 3
-; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
-; CHECK-NEXT: xvrepli.w $xr0, 1
-; CHECK-NEXT: xvand.v $xr0, $xr2, $xr0
+; CHECK-NEXT: xvldi $xr1, -1789
+; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvrepli.w $xr1, 1
+; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <8 x float>, ptr %a
@@ -563,76 +256,10 @@ define void @or_zext_masks_v16i16(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvseq.h $xr0, $xr0, $xr1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 3
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 5
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 7
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 9
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 11
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 13
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 15
-; CHECK-NEXT: vrepli.h $vr0, 255
-; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 8
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 9
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 10
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 11
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 12
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 13
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 14
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 15
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 0
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 1
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 2
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 3
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 4
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 5
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 6
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 7
-; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
-; CHECK-NEXT: xvrepli.h $xr0, 1
-; CHECK-NEXT: xvand.v $xr0, $xr2, $xr0
+; CHECK-NEXT: xvrepli.w $xr1, 255
+; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvrepli.h $xr1, 1
+; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <16 x i16>, ptr %a
@@ -677,10 +304,12 @@ define void @or_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
; LA64: # %bb.0:
; LA64-NEXT: xvld $xr0, $a1, 0
; LA64-NEXT: xvld $xr1, $a2, 0
+; LA64-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0)
+; LA64-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI9_0)
; LA64-NEXT: xvfcmp.clt.d $xr0, $xr0, $xr1
-; LA64-NEXT: xvrepli.b $xr1, -1
-; LA64-NEXT: xvextrins.d $xr1, $xr0, 17
-; LA64-NEXT: xvst $xr1, $a0, 0
+; LA64-NEXT: xvor.v $xr0, $xr0, $xr2
+; LA64-NEXT: xvsrai.d $xr0, $xr0, 32
+; LA64-NEXT: xvst $xr0, $a0, 0
; LA64-NEXT: ret
%v0 = load <4 x double>, ptr %a
%v1 = load <4 x double>, ptr %b
@@ -692,43 +321,16 @@ define void @or_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
}
define void @or_sext_masks_v8i32(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: or_sext_masks_v8i32:
-; LA32: # %bb.0:
-; LA32-NEXT: xvld $xr0, $a1, 0
-; LA32-NEXT: xvld $xr1, $a2, 0
-; LA32-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA32-NEXT: xvpickve.w $xr1, $xr0, 1
-; LA32-NEXT: xvrepli.b $xr2, -1
-; LA32-NEXT: xvinsve0.w $xr2, $xr1, 1
-; LA32-NEXT: xvpickve.w $xr1, $xr0, 3
-; LA32-NEXT: xvinsve0.w $xr2, $xr1, 3
-; LA32-NEXT: xvpickve.w $xr1, $xr0, 5
-; LA32-NEXT: xvinsve0.w $xr2, $xr1, 5
-; LA32-NEXT: xvpickve.w $xr0, $xr0, 7
-; LA32-NEXT: xvinsve0.w $xr2, $xr0, 7
-; LA32-NEXT: xvst $xr2, $a0, 0
-; LA32-NEXT: ret
-;
-; LA64-LABEL: or_sext_masks_v8i32:
-; LA64: # %bb.0:
-; LA64-NEXT: xvld $xr0, $a1, 0
-; LA64-NEXT: xvld $xr1, $a2, 0
-; LA64-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 7
-; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 5
-; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 3
-; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 1
-; LA64-NEXT: ext.w.h $a4, $a4
-; LA64-NEXT: xvrepli.b $xr0, -1
-; LA64-NEXT: xvinsgr2vr.w $xr0, $a4, 1
-; LA64-NEXT: ext.w.h $a3, $a3
-; LA64-NEXT: xvinsgr2vr.w $xr0, $a3, 3
-; LA64-NEXT: ext.w.h $a2, $a2
-; LA64-NEXT: xvinsgr2vr.w $xr0, $a2, 5
-; LA64-NEXT: ext.w.h $a1, $a1
-; LA64-NEXT: xvinsgr2vr.w $xr0, $a1, 7
-; LA64-NEXT: xvst $xr0, $a0, 0
-; LA64-NEXT: ret
+; CHECK-LABEL: or_sext_masks_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
+; CHECK-NEXT: xvldi $xr1, -1780
+; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvsrai.w $xr0, $xr0, 16
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
%v0 = load <8 x float>, ptr %a
%v1 = load <8 x float>, ptr %b
%m0 = fcmp olt <8 x float> %v0, %v1
@@ -744,48 +346,9 @@ define void @or_sext_masks_v16i16(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvseq.h $xr0, $xr0, $xr1
-; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 7
-; CHECK-NEXT: vpickve2gr.h $a2, $vr1, 5
-; CHECK-NEXT: vpickve2gr.h $a3, $vr1, 3
-; CHECK-NEXT: vpickve2gr.h $a4, $vr1, 1
-; CHECK-NEXT: vpickve2gr.h $a5, $vr0, 7
-; CHECK-NEXT: vpickve2gr.h $a6, $vr0, 5
-; CHECK-NEXT: vpickve2gr.h $a7, $vr0, 3
-; CHECK-NEXT: vpickve2gr.h $t0, $vr0, 1
-; CHECK-NEXT: ext.w.b $t0, $t0
-; CHECK-NEXT: xvrepli.b $xr0, -1
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $t0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 17
-; CHECK-NEXT: ext.w.b $a7, $a7
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a7
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 51
-; CHECK-NEXT: ext.w.b $a6, $a6
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a6
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 85
-; CHECK-NEXT: ext.w.b $a5, $a5
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a5
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 119
-; CHECK-NEXT: ext.w.b $a4, $a4
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a4
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 17
-; CHECK-NEXT: ext.w.b $a3, $a3
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a3
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 51
-; CHECK-NEXT: ext.w.b $a2, $a2
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a2
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 85
-; CHECK-NEXT: ext.w.b $a1, $a1
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a1
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 119
+; CHECK-NEXT: xvldi $xr1, -3585
+; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvsrai.h $xr0, $xr0, 8
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <16 x i16>, ptr %a
@@ -905,10 +468,12 @@ define void @and_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
; LA64: # %bb.0:
; LA64-NEXT: xvld $xr0, $a1, 0
; LA64-NEXT: xvld $xr1, $a2, 0
+; LA64-NEXT: pcalau12i $a1, %pc_hi20(.LCPI15_0)
+; LA64-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI15_0)
; LA64-NEXT: xvfcmp.clt.d $xr0, $xr0, $xr1
-; LA64-NEXT: xvrepli.b $xr1, 0
-; LA64-NEXT: xvextrins.d $xr1, $xr0, 0
-; LA64-NEXT: xvst $xr1, $a0, 0
+; LA64-NEXT: xvand.v $xr0, $xr0, $xr2
+; LA64-NEXT: xvsrai.d $xr0, $xr0, 32
+; LA64-NEXT: xvst $xr0, $a0, 0
; LA64-NEXT: ret
%v0 = load <4 x double>, ptr %a
%v1 = load <4 x double>, ptr %b
@@ -920,42 +485,16 @@ define void @and_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
}
define void @and_sext_masks_v8i32(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: and_sext_masks_v8i32:
-; LA32: # %bb.0:
-; LA32-NEXT: xvld $xr0, $a1, 0
-; LA32-NEXT: xvld $xr1, $a2, 0
-; LA32-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA32-NEXT: xvrepli.b $xr1, 0
-; LA32-NEXT: xvinsve0.w $xr1, $xr0, 0
-; LA32-NEXT: xvpickve.w $xr2, $xr0, 2
-; LA32-NEXT: xvinsve0.w $xr1, $xr2, 2
-; LA32-NEXT: xvpickve.w $xr2, $xr0, 4
-; LA32-NEXT: xvinsve0.w $xr1, $xr2, 4
-; LA32-NEXT: xvpickve.w $xr0, $xr0, 6
-; LA32-NEXT: xvinsve0.w $xr1, $xr0, 6
-; LA32-NEXT: xvst $xr1, $a0, 0
-; LA32-NEXT: ret
-;
-; LA64-LABEL: and_sext_masks_v8i32:
-; LA64: # %bb.0:
-; LA64-NEXT: xvld $xr0, $a1, 0
-; LA64-NEXT: xvld $xr1, $a2, 0
-; LA64-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 6
-; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 4
-; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 2
-; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 0
-; LA64-NEXT: ext.w.h $a4, $a4
-; LA64-NEXT: xvrepli.b $xr0, 0
-; LA64-NEXT: xvinsgr2vr.w $xr0, $a4, 0
-; LA64-NEXT: ext.w.h $a3, $a3
-; LA64-NEXT: xvinsgr2vr.w $xr0, $a3, 2
-; LA64-NEXT: ext.w.h $a2, $a2
-; LA64-NEXT: xvinsgr2vr.w $xr0, $a2, 4
-; LA64-NEXT: ext.w.h $a1, $a1
-; LA64-NEXT: xvinsgr2vr.w $xr0, $a1, 6
-; LA64-NEXT: xvst $xr0, $a0, 0
-; LA64-NEXT: ret
+; CHECK-LABEL: and_sext_masks_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
+; CHECK-NEXT: xvldi $xr1, -1780
+; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvsrai.w $xr0, $xr0, 16
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
%v0 = load <8 x float>, ptr %a
%v1 = load <8 x float>, ptr %b
%m0 = fcmp olt <8 x float> %v0, %v1
@@ -971,48 +510,9 @@ define void @and_sext_masks_v16i16(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvseq.h $xr0, $xr0, $xr1
-; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 6
-; CHECK-NEXT: vpickve2gr.h $a2, $vr1, 4
-; CHECK-NEXT: vpickve2gr.h $a3, $vr1, 2
-; CHECK-NEXT: vpickve2gr.h $a4, $vr1, 0
-; CHECK-NEXT: vpickve2gr.h $a5, $vr0, 6
-; CHECK-NEXT: vpickve2gr.h $a6, $vr0, 4
-; CHECK-NEXT: vpickve2gr.h $a7, $vr0, 2
-; CHECK-NEXT: vpickve2gr.h $t0, $vr0, 0
-; CHECK-NEXT: ext.w.b $t0, $t0
-; CHECK-NEXT: xvrepli.b $xr0, 0
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $t0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 0
-; CHECK-NEXT: ext.w.b $a7, $a7
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a7
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 34
-; CHECK-NEXT: ext.w.b $a6, $a6
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a6
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 68
-; CHECK-NEXT: ext.w.b $a5, $a5
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a5
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 102
-; CHECK-NEXT: ext.w.b $a4, $a4
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a4
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 0
-; CHECK-NEXT: ext.w.b $a3, $a3
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a3
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 34
-; CHECK-NEXT: ext.w.b $a2, $a2
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a2
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 68
-; CHECK-NEXT: ext.w.b $a1, $a1
-; CHECK-NEXT: xvreplgr2vr.h $xr1, $a1
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.h $xr0, $xr1, 102
+; CHECK-NEXT: xvldi $xr1, -3585
+; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvsrai.h $xr0, $xr0, 8
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <16 x i16>, ptr %a
More information about the llvm-branch-commits mailing list