[llvm] 792fa23 - [AArch64][SVE2] Lower OR to SLI/SRI (#77555)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 12 11:24:00 PST 2024


Author: Usman Nadeem
Date: 2024-01-12T11:23:56-08:00
New Revision: 792fa23c1bd0df92f4835f50300399c5db2d80b4

URL: https://github.com/llvm/llvm-project/commit/792fa23c1bd0df92f4835f50300399c5db2d80b4
DIFF: https://github.com/llvm/llvm-project/commit/792fa23c1bd0df92f4835f50300399c5db2d80b4.diff

LOG: [AArch64][SVE2] Lower OR to SLI/SRI (#77555)

The code builds on the existing NEON lowering, and the tests are adapted from
the NEON tests, minus the tests for illegal types.
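
As a concrete illustration (taken from the added test file below), an OR of a
masked value and a left-shifted value, where the mask clears exactly the bits
the shift fills in, now lowers to a single SLI when SVE2 is available:

    define <vscale x 16 x i8> @testLeftGood16x8(<vscale x 16 x i8> %src1, <vscale x 16 x i8> %src2) {
      %and.i = and <vscale x 16 x i8> %src1, splat(i8 7)
      %vshl_n = shl <vscale x 16 x i8> %src2, splat(i8 3)
      %result = or <vscale x 16 x i8> %and.i, %vshl_n
      ret <vscale x 16 x i8> %result
    }

    ; With -mattr=+sve2 this compiles to:
    ;   sli z0.b, z1.b, #3
    ;   ret
    ; whereas plain +sve keeps the and/lsl/orr three-instruction sequence.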

Added: 
    llvm/test/CodeGen/AArch64/sve2-sli-sri.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e2d07a09649680..91b36161ab46e8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1358,6 +1358,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
       if (!Subtarget->isLittleEndian())
         setOperationAction(ISD::BITCAST, VT, Expand);
+
+      if (Subtarget->hasSVE2orSME())
+        // For SLI/SRI.
+        setOperationAction(ISD::OR, VT, Custom);
     }
 
     // Illegal unpacked integer vector types.
@@ -5409,7 +5413,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
 
   case Intrinsic::aarch64_neon_vsri:
-  case Intrinsic::aarch64_neon_vsli: {
+  case Intrinsic::aarch64_neon_vsli:
+  case Intrinsic::aarch64_sve_sri:
+  case Intrinsic::aarch64_sve_sli: {
     EVT Ty = Op.getValueType();
 
     if (!Ty.isVector())
@@ -5417,7 +5423,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
     assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
 
-    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
+    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
+                        IntNo == Intrinsic::aarch64_sve_sri;
     unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
     return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3));
@@ -12542,6 +12549,53 @@ static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
   return true;
 }
 
+static bool isAllInactivePredicate(SDValue N) {
+  // Look through cast.
+  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
+    N = N.getOperand(0);
+
+  return ISD::isConstantSplatVectorAllZeros(N.getNode());
+}
+
+static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
+  unsigned NumElts = N.getValueType().getVectorMinNumElements();
+
+  // Look through cast.
+  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
+    N = N.getOperand(0);
+    // When reinterpreting from a type with fewer elements the "new" elements
+    // are not active, so bail if they're likely to be used.
+    if (N.getValueType().getVectorMinNumElements() < NumElts)
+      return false;
+  }
+
+  if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
+    return true;
+
+  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
+  // or smaller than the implicit element type represented by N.
+  // NOTE: A larger element count implies a smaller element type.
+  if (N.getOpcode() == AArch64ISD::PTRUE &&
+      N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
+    return N.getValueType().getVectorMinNumElements() >= NumElts;
+
+  // If we're compiling for a specific vector-length, we can check if the
+  // pattern's VL equals that of the scalable vector at runtime.
+  if (N.getOpcode() == AArch64ISD::PTRUE) {
+    const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+    unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
+    unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
+    if (MaxSVESize && MinSVESize == MaxSVESize) {
+      unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
+      unsigned PatNumElts =
+          getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
+      return PatNumElts == (NumElts * VScale);
+    }
+  }
+
+  return false;
+}
+
 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
 // BUILD_VECTORs with constant element C1, C2 is a constant, and:
@@ -12567,59 +12621,78 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   // Is one of the operands an AND or a BICi? The AND may have been optimised to
   // a BICi in order to use an immediate instead of a register.
   // Is the other operand an shl or lshr? This will have been turned into:
-  // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
+  // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
+  // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
   if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
-      (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
+      (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
+       SecondOpc == AArch64ISD::SHL_PRED ||
+       SecondOpc == AArch64ISD::SRL_PRED)) {
     And = FirstOp;
     Shift = SecondOp;
 
   } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
-             (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
+             (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
+              FirstOpc == AArch64ISD::SHL_PRED ||
+              FirstOpc == AArch64ISD::SRL_PRED)) {
     And = SecondOp;
     Shift = FirstOp;
   } else
     return SDValue();
 
   bool IsAnd = And.getOpcode() == ISD::AND;
-  bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
-
-  // Is the shift amount constant?
-  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
-  if (!C2node)
+  bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
+                      Shift.getOpcode() == AArch64ISD::SRL_PRED;
+  bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
+                        Shift.getOpcode() == AArch64ISD::SRL_PRED;
+
+  // Is the shift amount constant and are all lanes active?
+  uint64_t C2;
+  if (ShiftHasPredOp) {
+    if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
+      return SDValue();
+    APInt C;
+    if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
+      return SDValue();
+    C2 = C.getZExtValue();
+  } else if (ConstantSDNode *C2node =
+                 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
+    C2 = C2node->getZExtValue();
+  else
     return SDValue();
 
-  uint64_t C1;
+  APInt C1AsAPInt;
+  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
   if (IsAnd) {
     // Is the and mask vector all constant?
-    if (!isAllConstantBuildVector(And.getOperand(1), C1))
+    if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
       return SDValue();
   } else {
     // Reconstruct the corresponding AND immediate from the two BICi immediates.
     ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
     ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
     assert(C1nodeImm && C1nodeShift);
-    C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
+    C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
+    C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
   }
 
   // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
   // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
   // how much one can shift elements of a particular size?
-  uint64_t C2 = C2node->getZExtValue();
-  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
   if (C2 > ElemSizeInBits)
     return SDValue();
 
-  APInt C1AsAPInt(ElemSizeInBits, C1);
   APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
                                   : APInt::getLowBitsSet(ElemSizeInBits, C2);
   if (C1AsAPInt != RequiredC1)
     return SDValue();
 
   SDValue X = And.getOperand(0);
-  SDValue Y = Shift.getOperand(0);
+  SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
+  SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
+                               : Shift.getOperand(1);
 
   unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
-  SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
+  SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
 
   LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
   LLVM_DEBUG(N->dump(&DAG));
@@ -12641,6 +12714,8 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
     return Res;
 
   EVT VT = Op.getValueType();
+  if (VT.isScalableVector())
+    return Op;
 
   SDValue LHS = Op.getOperand(0);
   BuildVectorSDNode *BVN =
@@ -17432,53 +17507,6 @@ static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
   return false;
 }
 
-static bool isAllInactivePredicate(SDValue N) {
-  // Look through cast.
-  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
-    N = N.getOperand(0);
-
-  return ISD::isConstantSplatVectorAllZeros(N.getNode());
-}
-
-static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
-  unsigned NumElts = N.getValueType().getVectorMinNumElements();
-
-  // Look through cast.
-  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
-    N = N.getOperand(0);
-    // When reinterpreting from a type with fewer elements the "new" elements
-    // are not active, so bail if they're likely to be used.
-    if (N.getValueType().getVectorMinNumElements() < NumElts)
-      return false;
-  }
-
-  if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
-    return true;
-
-  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
-  // or smaller than the implicit element type represented by N.
-  // NOTE: A larger element count implies a smaller element type.
-  if (N.getOpcode() == AArch64ISD::PTRUE &&
-      N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
-    return N.getValueType().getVectorMinNumElements() >= NumElts;
-
-  // If we're compiling for a specific vector-length, we can check if the
-  // pattern's VL equals that of the scalable vector at runtime.
-  if (N.getOpcode() == AArch64ISD::PTRUE) {
-    const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
-    unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
-    unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
-    if (MaxSVESize && MinSVESize == MaxSVESize) {
-      unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
-      unsigned PatNumElts =
-          getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
-      return PatNumElts == (NumElts * VScale);
-    }
-  }
-
-  return false;
-}
-
 static SDValue performReinterpretCastCombine(SDNode *N) {
   SDValue LeafOp = SDValue(N, 0);
   SDValue Op = N->getOperand(0);

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4782ad076c605d..c4d69232c9e30e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3577,8 +3577,8 @@ let Predicates = [HasSVE2orSME] in {
   defm PMULLT_ZZZ   : sve2_pmul_long<0b1, "pmullt", int_aarch64_sve_pmullt_pair>;
 
   // SVE2 bitwise shift and insert
-  defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", int_aarch64_sve_sri>;
-  defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>;
+  defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", AArch64vsri>;
+  defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", AArch64vsli>;
 
   // SVE2 bitwise shift right and accumulate
   defm SSRA_ZZI  : sve2_int_bin_accum_shift_imm_right<0b00, "ssra",  AArch64ssra>;

diff --git a/llvm/test/CodeGen/AArch64/sve2-sli-sri.ll b/llvm/test/CodeGen/AArch64/sve2-sli-sri.ll
new file mode 100644
index 00000000000000..80999fb1f4864b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-sli-sri.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefixes=CHECK,SVE %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s -o - | FileCheck --check-prefixes=CHECK,SVE2 %s
+
+define <vscale x 16 x i8> @testLeftGood16x8(<vscale x 16 x i8> %src1, <vscale x 16 x i8> %src2) {
+; SVE-LABEL: testLeftGood16x8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    and z0.b, z0.b, #0x7
+; SVE-NEXT:    lsl z1.b, z1.b, #3
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: testLeftGood16x8:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    sli z0.b, z1.b, #3
+; SVE2-NEXT:    ret
+  %and.i = and <vscale x 16 x i8> %src1, splat(i8 7)
+  %vshl_n = shl <vscale x 16 x i8> %src2, splat(i8 3)
+  %result = or <vscale x 16 x i8> %and.i, %vshl_n
+  ret <vscale x 16 x i8> %result
+}
+
+define <vscale x 16 x i8> @testLeftBad16x8(<vscale x 16 x i8> %src1, <vscale x 16 x i8> %src2) {
+; CHECK-LABEL: testLeftBad16x8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.b, #-91 // =0xffffffffffffffa5
+; CHECK-NEXT:    lsl z1.b, z1.b, #1
+; CHECK-NEXT:    and z0.d, z0.d, z2.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %and.i = and <vscale x 16 x i8> %src1, splat(i8 165)
+  %vshl_n = shl <vscale x 16 x i8> %src2, splat(i8 1)
+  %result = or <vscale x 16 x i8> %and.i, %vshl_n
+  ret <vscale x 16 x i8> %result
+}
+
+define <vscale x 16 x i8> @testRightGood16x8(<vscale x 16 x i8> %src1, <vscale x 16 x i8> %src2) {
+; SVE-LABEL: testRightGood16x8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    and z0.b, z0.b, #0xe0
+; SVE-NEXT:    lsr z1.b, z1.b, #3
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: testRightGood16x8:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    sri z0.b, z1.b, #3
+; SVE2-NEXT:    ret
+  %and.i = and <vscale x 16 x i8> %src1, splat(i8 224)
+  %vshl_n = lshr <vscale x 16 x i8> %src2, splat(i8 3)
+  %result = or <vscale x 16 x i8> %and.i, %vshl_n
+  ret <vscale x 16 x i8> %result
+}
+
+define <vscale x 16 x i8> @testRightBad16x8(<vscale x 16 x i8> %src1, <vscale x 16 x i8> %src2) {
+; CHECK-LABEL: testRightBad16x8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.b, #-91 // =0xffffffffffffffa5
+; CHECK-NEXT:    lsr z1.b, z1.b, #1
+; CHECK-NEXT:    and z0.d, z0.d, z2.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %and.i = and <vscale x 16 x i8> %src1, splat(i8 165)
+  %vshl_n = lshr <vscale x 16 x i8> %src2, splat(i8 1)
+  %result = or <vscale x 16 x i8> %and.i, %vshl_n
+  ret <vscale x 16 x i8> %result
+}
+
+define <vscale x 8 x i16> @testLeftGood8x16(<vscale x 8 x i16> %src1, <vscale x 8 x i16> %src2) {
+; SVE-LABEL: testLeftGood8x16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    and z0.h, z0.h, #0x3fff
+; SVE-NEXT:    lsl z1.h, z1.h, #14
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: testLeftGood8x16:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    sli z0.h, z1.h, #14
+; SVE2-NEXT:    ret
+  %and.i = and <vscale x 8 x i16> %src1, splat(i16 16383)
+  %vshl_n = shl <vscale x 8 x i16> %src2, splat(i16 14)
+  %result = or <vscale x 8 x i16> %and.i, %vshl_n
+  ret <vscale x 8 x i16> %result
+}
+
+define <vscale x 8 x i16> @testLeftBad8x16(<vscale x 8 x i16> %src1, <vscale x 8 x i16> %src2) {
+; CHECK-LABEL: testLeftBad8x16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #16500 // =0x4074
+; CHECK-NEXT:    lsl z1.h, z1.h, #14
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    and z0.d, z0.d, z2.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %and.i = and <vscale x 8 x i16> %src1, splat(i16 16500)
+  %vshl_n = shl <vscale x 8 x i16> %src2, splat(i16 14)
+  %result = or <vscale x 8 x i16> %and.i, %vshl_n
+  ret <vscale x 8 x i16> %result
+}
+
+define <vscale x 8 x i16> @testRightGood8x16(<vscale x 8 x i16> %src1, <vscale x 8 x i16> %src2) {
+; SVE-LABEL: testRightGood8x16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    and z0.h, z0.h, #0xfffc
+; SVE-NEXT:    lsr z1.h, z1.h, #14
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: testRightGood8x16:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    sri z0.h, z1.h, #14
+; SVE2-NEXT:    ret
+  %and.i = and <vscale x 8 x i16> %src1, splat(i16 65532)
+  %vshl_n = lshr <vscale x 8 x i16> %src2, splat(i16 14)
+  %result = or <vscale x 8 x i16> %and.i, %vshl_n
+  ret <vscale x 8 x i16> %result
+}
+
+define <vscale x 8 x i16> @testRightBad8x16(<vscale x 8 x i16> %src1, <vscale x 8 x i16> %src2) {
+; CHECK-LABEL: testRightBad8x16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #16500 // =0x4074
+; CHECK-NEXT:    lsr z1.h, z1.h, #14
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    and z0.d, z0.d, z2.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %and.i = and <vscale x 8 x i16> %src1, splat(i16 16500)
+  %vshl_n = lshr <vscale x 8 x i16> %src2, splat(i16 14)
+  %result = or <vscale x 8 x i16> %and.i, %vshl_n
+  ret <vscale x 8 x i16> %result
+}
+
+define <vscale x 4 x i32> @testLeftGood4x32(<vscale x 4 x i32> %src1, <vscale x 4 x i32> %src2) {
+; SVE-LABEL: testLeftGood4x32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    and z0.s, z0.s, #0x3fffff
+; SVE-NEXT:    lsl z1.s, z1.s, #22
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: testLeftGood4x32:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    sli z0.s, z1.s, #22
+; SVE2-NEXT:    ret
+  %and.i = and <vscale x 4 x i32> %src1, splat(i32 4194303)
+  %vshl_n = shl <vscale x 4 x i32> %src2, splat(i32 22)
+  %result = or <vscale x 4 x i32> %and.i, %vshl_n
+  ret <vscale x 4 x i32> %result
+}
+
+define <vscale x 4 x i32> @testLeftBad4x32(<vscale x 4 x i32> %src1, <vscale x 4 x i32> %src2) {
+; CHECK-LABEL: testLeftBad4x32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.s, z0.s, #0x3ffffc
+; CHECK-NEXT:    lsl z1.s, z1.s, #22
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %and.i = and <vscale x 4 x i32> %src1, splat(i32 4194300)
+  %vshl_n = shl <vscale x 4 x i32> %src2, splat(i32 22)
+  %result = or <vscale x 4 x i32> %and.i, %vshl_n
+  ret <vscale x 4 x i32> %result
+}
+
+define <vscale x 4 x i32> @testRightGood4x32(<vscale x 4 x i32> %src1, <vscale x 4 x i32> %src2) {
+; SVE-LABEL: testRightGood4x32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    and z0.s, z0.s, #0xfffffc00
+; SVE-NEXT:    lsr z1.s, z1.s, #22
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: testRightGood4x32:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    sri z0.s, z1.s, #22
+; SVE2-NEXT:    ret
+  %and.i = and <vscale x 4 x i32> %src1, splat(i32 4294966272)
+  %vshl_n = lshr <vscale x 4 x i32> %src2, splat(i32 22)
+  %result = or <vscale x 4 x i32> %and.i, %vshl_n
+  ret <vscale x 4 x i32> %result
+}
+
+define <vscale x 4 x i32> @testRightBad4x32(<vscale x 4 x i32> %src1, <vscale x 4 x i32> %src2) {
+; CHECK-LABEL: testRightBad4x32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.s, z0.s, #0x3ffffc
+; CHECK-NEXT:    lsr z1.s, z1.s, #22
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %and.i = and <vscale x 4 x i32> %src1, splat(i32 4194300)
+  %vshl_n = lshr <vscale x 4 x i32> %src2, splat(i32 22)
+  %result = or <vscale x 4 x i32> %and.i, %vshl_n
+  ret <vscale x 4 x i32> %result
+}
+
+define <vscale x 2 x i64> @testLeftGood2x64(<vscale x 2 x i64> %src1, <vscale x 2 x i64> %src2) {
+; SVE-LABEL: testLeftGood2x64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    and z0.d, z0.d, #0xffffffffffff
+; SVE-NEXT:    lsl z1.d, z1.d, #48
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: testLeftGood2x64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    sli z0.d, z1.d, #48
+; SVE2-NEXT:    ret
+  %and.i = and <vscale x 2 x i64> %src1, splat(i64 281474976710655)
+  %vshl_n = shl <vscale x 2 x i64> %src2, splat(i64 48)
+  %result = or <vscale x 2 x i64> %and.i, %vshl_n
+  ret <vscale x 2 x i64> %result
+}
+
+define <vscale x 2 x i64> @testLeftBad2x64(<vscale x 2 x i64> %src1, <vscale x 2 x i64> %src2) {
+; CHECK-LABEL: testLeftBad2x64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #10 // =0xa
+; CHECK-NEXT:    lsl z1.d, z1.d, #48
+; CHECK-NEXT:    movk x8, #1, lsl #48
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z0.d, z0.d, z2.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %and.i = and <vscale x 2 x i64> %src1, splat(i64 281474976710666)
+  %vshl_n = shl <vscale x 2 x i64> %src2, splat(i64 48)
+  %result = or <vscale x 2 x i64> %and.i, %vshl_n
+  ret <vscale x 2 x i64> %result
+}
+
+define <vscale x 2 x i64> @testRightGood2x64(<vscale x 2 x i64> %src1, <vscale x 2 x i64> %src2) {
+; SVE-LABEL: testRightGood2x64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    and z0.d, z0.d, #0xffffffffffff0000
+; SVE-NEXT:    lsr z1.d, z1.d, #48
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: testRightGood2x64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    sri z0.d, z1.d, #48
+; SVE2-NEXT:    ret
+  %and.i = and <vscale x 2 x i64> %src1, splat(i64 18446744073709486080)
+  %vshl_n = lshr <vscale x 2 x i64> %src2, splat(i64 48)
+  %result = or <vscale x 2 x i64> %and.i, %vshl_n
+  ret <vscale x 2 x i64> %result
+}
+
+define <vscale x 2 x i64> @testRightBad2x64(<vscale x 2 x i64> %src1, <vscale x 2 x i64> %src2) {
+; CHECK-LABEL: testRightBad2x64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #10 // =0xa
+; CHECK-NEXT:    lsr z1.d, z1.d, #48
+; CHECK-NEXT:    movk x8, #1, lsl #48
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z0.d, z0.d, z2.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %and.i = and <vscale x 2 x i64> %src1, splat(i64 281474976710666)
+  %vshl_n = lshr <vscale x 2 x i64> %src2, splat(i64 48)
+  %result = or <vscale x 2 x i64> %and.i, %vshl_n
+  ret <vscale x 2 x i64> %result
+}

More information about the llvm-commits mailing list