[llvm] 1db7b9c - [ARM] Make a BE predicate bitcast consistent with the rest of llvm

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 11 01:00:05 PST 2021


Author: David Green
Date: 2021-02-11T08:59:52Z
New Revision: 1db7b9ceaae43ab3c44ac96e238ec078538204ea

URL: https://github.com/llvm/llvm-project/commit/1db7b9ceaae43ab3c44ac96e238ec078538204ea
DIFF: https://github.com/llvm/llvm-project/commit/1db7b9ceaae43ab3c44ac96e238ec078538204ea.diff

LOG: [ARM] Make a BE predicate bitcast consistent with the rest of llvm

We were storing predicate registers, such as a <8 x i1>, in the opposite
order to how the rest of llvm expects. This actually turns out to be
correct for the one place that usually uses it - the
ScalarizeMaskedMemIntrin pass, but only because the pass was incorrect
itself. This fixes the order so that bits are stored in the opposite
order and bitcasts work as expected. This allows the Scalarization pass
to be fixed, as in https://reviews.llvm.org/D94765.

Differential Revision: https://reviews.llvm.org/D94867

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
    llvm/test/CodeGen/Thumb2/mve-masked-load.ll
    llvm/test/CodeGen/Thumb2/mve-masked-store.ll
    llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
    llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 1fde0df2770c..bc4dbd9e1dd5 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -9451,13 +9451,20 @@ static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
   // the bottom bits of the predicate.
   // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
   // for BE).
+  // Speaking of BE, apparently the rest of llvm will assume a reverse order to
+  // a natural VMSR(load), so needs to be reversed.
 
   SDLoc dl(Op);
   SDValue Load = DAG.getExtLoad(
       ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
       EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
       LD->getMemOperand());
-  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
+  SDValue Val = Load;
+  if (DAG.getDataLayout().isBigEndian())
+    Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
+                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
+                      DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
+  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
   if (MemVT != MVT::v16i1)
     Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
                        DAG.getConstant(0, dl, MVT::i32));
@@ -9498,14 +9505,22 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
   SDValue Build = ST->getValue();
   if (MemVT != MVT::v16i1) {
     SmallVector<SDValue, 16> Ops;
-    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
+    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
+      unsigned Elt = DAG.getDataLayout().isBigEndian()
+                         ? MemVT.getVectorNumElements() - I - 1
+                         : I;
       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
-                                DAG.getConstant(I, dl, MVT::i32)));
+                                DAG.getConstant(Elt, dl, MVT::i32)));
+    }
     for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
       Ops.push_back(DAG.getUNDEF(MVT::i32));
     Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
   }
   SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
+  if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
+    GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
+                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
+                      DAG.getConstant(16, dl, MVT::i32));
   return DAG.getTruncStore(
       ST->getChain(), dl, GRP, ST->getBasePtr(),
       EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
index b5f9d6676532..7ec6a24b5f6a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
@@ -816,102 +816,199 @@ entry:
 }
 
 define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%src) {
-; CHECK-LABEL: foo_v4f32_v4f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs lr, p0
-; CHECK-NEXT:    and r1, lr, #1
-; CHECK-NEXT:    ubfx r3, lr, #4, #1
-; CHECK-NEXT:    rsb.w r12, r1, #0
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r12, #0, #1
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, lr, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    ubfx r3, lr, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #3, #1
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    bne .LBB18_6
-; CHECK-NEXT:  @ %bb.1: @ %else
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    bmi .LBB18_7
-; CHECK-NEXT:  .LBB18_2: @ %else2
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    bmi .LBB18_8
-; CHECK-NEXT:  .LBB18_3: @ %else5
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    bpl .LBB18_5
-; CHECK-NEXT:  .LBB18_4: @ %cond.load7
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vldr.16 s4, [r2, #6]
-; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:  .LBB18_5: @ %else8
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s4
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s5
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s6
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s7
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    pop {r7, pc}
-; CHECK-NEXT:  .LBB18_6: @ %cond.load
-; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    bpl .LBB18_2
-; CHECK-NEXT:  .LBB18_7: @ %cond.load1
-; CHECK-NEXT:    vldr.16 s4, [r2, #2]
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    bpl .LBB18_3
-; CHECK-NEXT:  .LBB18_8: @ %cond.load4
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vldr.16 s1, [r2, #4]
-; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    bmi .LBB18_4
-; CHECK-NEXT:    b .LBB18_5
+; CHECK-LE-LABEL: foo_v4f32_v4f16:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
+; CHECK-LE-NEXT:    .pad #8
+; CHECK-LE-NEXT:    sub sp, #8
+; CHECK-LE-NEXT:    vldrh.s32 q0, [r1]
+; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
+; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    vmrs lr, p0
+; CHECK-LE-NEXT:    and r1, lr, #1
+; CHECK-LE-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-LE-NEXT:    rsb.w r12, r1, #0
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r12, #0, #1
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, lr, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, lr, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #3, #1
+; CHECK-LE-NEXT:    lsls r3, r1, #31
+; CHECK-LE-NEXT:    bne .LBB18_6
+; CHECK-LE-NEXT:  @ %bb.1: @ %else
+; CHECK-LE-NEXT:    lsls r3, r1, #30
+; CHECK-LE-NEXT:    bmi .LBB18_7
+; CHECK-LE-NEXT:  .LBB18_2: @ %else2
+; CHECK-LE-NEXT:    lsls r3, r1, #29
+; CHECK-LE-NEXT:    bmi .LBB18_8
+; CHECK-LE-NEXT:  .LBB18_3: @ %else5
+; CHECK-LE-NEXT:    lsls r1, r1, #28
+; CHECK-LE-NEXT:    bpl .LBB18_5
+; CHECK-LE-NEXT:  .LBB18_4: @ %cond.load7
+; CHECK-LE-NEXT:    vmovx.f16 s4, s0
+; CHECK-LE-NEXT:    vins.f16 s0, s4
+; CHECK-LE-NEXT:    vldr.16 s4, [r2, #6]
+; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:  .LBB18_5: @ %else8
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vcvtt.f32.f16 s7, s1
+; CHECK-LE-NEXT:    vcvtb.f32.f16 s6, s1
+; CHECK-LE-NEXT:    vcvtt.f32.f16 s5, s0
+; CHECK-LE-NEXT:    vcvtb.f32.f16 s4, s0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    itt ne
+; CHECK-LE-NEXT:    vmovne r2, s4
+; CHECK-LE-NEXT:    strne r2, [r0]
+; CHECK-LE-NEXT:    lsls r2, r1, #30
+; CHECK-LE-NEXT:    itt mi
+; CHECK-LE-NEXT:    vmovmi r2, s5
+; CHECK-LE-NEXT:    strmi r2, [r0, #4]
+; CHECK-LE-NEXT:    lsls r2, r1, #29
+; CHECK-LE-NEXT:    itt mi
+; CHECK-LE-NEXT:    vmovmi r2, s6
+; CHECK-LE-NEXT:    strmi r2, [r0, #8]
+; CHECK-LE-NEXT:    lsls r1, r1, #28
+; CHECK-LE-NEXT:    itt mi
+; CHECK-LE-NEXT:    vmovmi r1, s7
+; CHECK-LE-NEXT:    strmi r1, [r0, #12]
+; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    pop {r7, pc}
+; CHECK-LE-NEXT:  .LBB18_6: @ %cond.load
+; CHECK-LE-NEXT:    vldr.16 s0, [r2]
+; CHECK-LE-NEXT:    lsls r3, r1, #30
+; CHECK-LE-NEXT:    bpl .LBB18_2
+; CHECK-LE-NEXT:  .LBB18_7: @ %cond.load1
+; CHECK-LE-NEXT:    vldr.16 s4, [r2, #2]
+; CHECK-LE-NEXT:    vins.f16 s0, s4
+; CHECK-LE-NEXT:    vmovx.f16 s4, s1
+; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:    lsls r3, r1, #29
+; CHECK-LE-NEXT:    bpl .LBB18_3
+; CHECK-LE-NEXT:  .LBB18_8: @ %cond.load4
+; CHECK-LE-NEXT:    vmovx.f16 s4, s0
+; CHECK-LE-NEXT:    vins.f16 s0, s4
+; CHECK-LE-NEXT:    vmovx.f16 s4, s1
+; CHECK-LE-NEXT:    vldr.16 s1, [r2, #4]
+; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:    lsls r1, r1, #28
+; CHECK-LE-NEXT:    bmi .LBB18_4
+; CHECK-LE-NEXT:    b .LBB18_5
+;
+; CHECK-BE-LABEL: foo_v4f32_v4f16:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r7, lr}
+; CHECK-BE-NEXT:    push {r7, lr}
+; CHECK-BE-NEXT:    .pad #8
+; CHECK-BE-NEXT:    sub sp, #8
+; CHECK-BE-NEXT:    vldrh.s32 q0, [r1]
+; CHECK-BE-NEXT:    vcmp.s32 gt, q0, zr
+; CHECK-BE-NEXT:    @ implicit-def: $q0
+; CHECK-BE-NEXT:    vmrs lr, p0
+; CHECK-BE-NEXT:    ubfx r1, lr, #12, #1
+; CHECK-BE-NEXT:    ubfx r3, lr, #8, #1
+; CHECK-BE-NEXT:    rsb.w r12, r1, #0
+; CHECK-BE-NEXT:    movs r1, #0
+; CHECK-BE-NEXT:    bfi r1, r12, #0, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    and r3, lr, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #3, #1
+; CHECK-BE-NEXT:    lsls r3, r1, #31
+; CHECK-BE-NEXT:    bne .LBB18_6
+; CHECK-BE-NEXT:  @ %bb.1: @ %else
+; CHECK-BE-NEXT:    lsls r3, r1, #30
+; CHECK-BE-NEXT:    bmi .LBB18_7
+; CHECK-BE-NEXT:  .LBB18_2: @ %else2
+; CHECK-BE-NEXT:    lsls r3, r1, #29
+; CHECK-BE-NEXT:    bmi .LBB18_8
+; CHECK-BE-NEXT:  .LBB18_3: @ %else5
+; CHECK-BE-NEXT:    lsls r1, r1, #28
+; CHECK-BE-NEXT:    bpl .LBB18_5
+; CHECK-BE-NEXT:  .LBB18_4: @ %cond.load7
+; CHECK-BE-NEXT:    vmovx.f16 s4, s0
+; CHECK-BE-NEXT:    vins.f16 s0, s4
+; CHECK-BE-NEXT:    vldr.16 s4, [r2, #6]
+; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:  .LBB18_5: @ %else8
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    movs r1, #0
+; CHECK-BE-NEXT:    vcvtt.f32.f16 s7, s1
+; CHECK-BE-NEXT:    vcvtb.f32.f16 s6, s1
+; CHECK-BE-NEXT:    vcvtt.f32.f16 s5, s0
+; CHECK-BE-NEXT:    vcvtb.f32.f16 s4, s0
+; CHECK-BE-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    itt ne
+; CHECK-BE-NEXT:    vmovne r2, s4
+; CHECK-BE-NEXT:    strne r2, [r0]
+; CHECK-BE-NEXT:    lsls r2, r1, #30
+; CHECK-BE-NEXT:    itt mi
+; CHECK-BE-NEXT:    vmovmi r2, s5
+; CHECK-BE-NEXT:    strmi r2, [r0, #4]
+; CHECK-BE-NEXT:    lsls r2, r1, #29
+; CHECK-BE-NEXT:    itt mi
+; CHECK-BE-NEXT:    vmovmi r2, s6
+; CHECK-BE-NEXT:    strmi r2, [r0, #8]
+; CHECK-BE-NEXT:    lsls r1, r1, #28
+; CHECK-BE-NEXT:    itt mi
+; CHECK-BE-NEXT:    vmovmi r1, s7
+; CHECK-BE-NEXT:    strmi r1, [r0, #12]
+; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    pop {r7, pc}
+; CHECK-BE-NEXT:  .LBB18_6: @ %cond.load
+; CHECK-BE-NEXT:    vldr.16 s0, [r2]
+; CHECK-BE-NEXT:    lsls r3, r1, #30
+; CHECK-BE-NEXT:    bpl .LBB18_2
+; CHECK-BE-NEXT:  .LBB18_7: @ %cond.load1
+; CHECK-BE-NEXT:    vldr.16 s4, [r2, #2]
+; CHECK-BE-NEXT:    vins.f16 s0, s4
+; CHECK-BE-NEXT:    vmovx.f16 s4, s1
+; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:    lsls r3, r1, #29
+; CHECK-BE-NEXT:    bpl .LBB18_3
+; CHECK-BE-NEXT:  .LBB18_8: @ %cond.load4
+; CHECK-BE-NEXT:    vmovx.f16 s4, s0
+; CHECK-BE-NEXT:    vins.f16 s0, s4
+; CHECK-BE-NEXT:    vmovx.f16 s4, s1
+; CHECK-BE-NEXT:    vldr.16 s1, [r2, #4]
+; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:    lsls r1, r1, #28
+; CHECK-BE-NEXT:    bmi .LBB18_4
+; CHECK-BE-NEXT:    b .LBB18_5
 entry:
   %0 = load <4 x i16>, <4 x i16>* %mask, align 2
   %1 = icmp sgt <4 x i16> %0, zeroinitializer
@@ -922,102 +1019,199 @@ entry:
 }
 
 define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%src) {
-; CHECK-LABEL: foo_v4f32_v4f16_unaligned:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    @ implicit-def: $q0
-; CHECK-NEXT:    vmrs lr, p0
-; CHECK-NEXT:    and r1, lr, #1
-; CHECK-NEXT:    ubfx r3, lr, #4, #1
-; CHECK-NEXT:    rsb.w r12, r1, #0
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r12, #0, #1
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, lr, #8, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    ubfx r3, lr, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #3, #1
-; CHECK-NEXT:    lsls r3, r1, #31
-; CHECK-NEXT:    bne .LBB19_6
-; CHECK-NEXT:  @ %bb.1: @ %else
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    bmi .LBB19_7
-; CHECK-NEXT:  .LBB19_2: @ %else2
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    bmi .LBB19_8
-; CHECK-NEXT:  .LBB19_3: @ %else5
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    bpl .LBB19_5
-; CHECK-NEXT:  .LBB19_4: @ %cond.load7
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vldr.16 s4, [r2, #6]
-; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:  .LBB19_5: @ %else8
-; CHECK-NEXT:    vmrs r2, p0
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
-; CHECK-NEXT:    and r3, r2, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #0, #1
-; CHECK-NEXT:    ubfx r3, r2, #4, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #1, #1
-; CHECK-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-NEXT:    ubfx r2, r2, #12, #1
-; CHECK-NEXT:    rsbs r3, r3, #0
-; CHECK-NEXT:    bfi r1, r3, #2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    bfi r1, r2, #3, #1
-; CHECK-NEXT:    lsls r2, r1, #31
-; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s4
-; CHECK-NEXT:    strne r2, [r0]
-; CHECK-NEXT:    lsls r2, r1, #30
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s5
-; CHECK-NEXT:    strmi r2, [r0, #4]
-; CHECK-NEXT:    lsls r2, r1, #29
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s6
-; CHECK-NEXT:    strmi r2, [r0, #8]
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s7
-; CHECK-NEXT:    strmi r1, [r0, #12]
-; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    pop {r7, pc}
-; CHECK-NEXT:  .LBB19_6: @ %cond.load
-; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    lsls r3, r1, #30
-; CHECK-NEXT:    bpl .LBB19_2
-; CHECK-NEXT:  .LBB19_7: @ %cond.load1
-; CHECK-NEXT:    vldr.16 s4, [r2, #2]
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:    lsls r3, r1, #29
-; CHECK-NEXT:    bpl .LBB19_3
-; CHECK-NEXT:  .LBB19_8: @ %cond.load4
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vldr.16 s1, [r2, #4]
-; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:    lsls r1, r1, #28
-; CHECK-NEXT:    bmi .LBB19_4
-; CHECK-NEXT:    b .LBB19_5
+; CHECK-LE-LABEL: foo_v4f32_v4f16_unaligned:
+; CHECK-LE:       @ %bb.0: @ %entry
+; CHECK-LE-NEXT:    .save {r7, lr}
+; CHECK-LE-NEXT:    push {r7, lr}
+; CHECK-LE-NEXT:    .pad #8
+; CHECK-LE-NEXT:    sub sp, #8
+; CHECK-LE-NEXT:    vldrh.s32 q0, [r1]
+; CHECK-LE-NEXT:    vcmp.s32 gt, q0, zr
+; CHECK-LE-NEXT:    @ implicit-def: $q0
+; CHECK-LE-NEXT:    vmrs lr, p0
+; CHECK-LE-NEXT:    and r1, lr, #1
+; CHECK-LE-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-LE-NEXT:    rsb.w r12, r1, #0
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r12, #0, #1
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, lr, #8, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    ubfx r3, lr, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #3, #1
+; CHECK-LE-NEXT:    lsls r3, r1, #31
+; CHECK-LE-NEXT:    bne .LBB19_6
+; CHECK-LE-NEXT:  @ %bb.1: @ %else
+; CHECK-LE-NEXT:    lsls r3, r1, #30
+; CHECK-LE-NEXT:    bmi .LBB19_7
+; CHECK-LE-NEXT:  .LBB19_2: @ %else2
+; CHECK-LE-NEXT:    lsls r3, r1, #29
+; CHECK-LE-NEXT:    bmi .LBB19_8
+; CHECK-LE-NEXT:  .LBB19_3: @ %else5
+; CHECK-LE-NEXT:    lsls r1, r1, #28
+; CHECK-LE-NEXT:    bpl .LBB19_5
+; CHECK-LE-NEXT:  .LBB19_4: @ %cond.load7
+; CHECK-LE-NEXT:    vmovx.f16 s4, s0
+; CHECK-LE-NEXT:    vins.f16 s0, s4
+; CHECK-LE-NEXT:    vldr.16 s4, [r2, #6]
+; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:  .LBB19_5: @ %else8
+; CHECK-LE-NEXT:    vmrs r2, p0
+; CHECK-LE-NEXT:    movs r1, #0
+; CHECK-LE-NEXT:    vcvtt.f32.f16 s7, s1
+; CHECK-LE-NEXT:    vcvtb.f32.f16 s6, s1
+; CHECK-LE-NEXT:    vcvtt.f32.f16 s5, s0
+; CHECK-LE-NEXT:    vcvtb.f32.f16 s4, s0
+; CHECK-LE-NEXT:    and r3, r2, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-LE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT:    rsbs r3, r3, #0
+; CHECK-LE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-LE-NEXT:    rsbs r2, r2, #0
+; CHECK-LE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-LE-NEXT:    lsls r2, r1, #31
+; CHECK-LE-NEXT:    itt ne
+; CHECK-LE-NEXT:    vmovne r2, s4
+; CHECK-LE-NEXT:    strne r2, [r0]
+; CHECK-LE-NEXT:    lsls r2, r1, #30
+; CHECK-LE-NEXT:    itt mi
+; CHECK-LE-NEXT:    vmovmi r2, s5
+; CHECK-LE-NEXT:    strmi r2, [r0, #4]
+; CHECK-LE-NEXT:    lsls r2, r1, #29
+; CHECK-LE-NEXT:    itt mi
+; CHECK-LE-NEXT:    vmovmi r2, s6
+; CHECK-LE-NEXT:    strmi r2, [r0, #8]
+; CHECK-LE-NEXT:    lsls r1, r1, #28
+; CHECK-LE-NEXT:    itt mi
+; CHECK-LE-NEXT:    vmovmi r1, s7
+; CHECK-LE-NEXT:    strmi r1, [r0, #12]
+; CHECK-LE-NEXT:    add sp, #8
+; CHECK-LE-NEXT:    pop {r7, pc}
+; CHECK-LE-NEXT:  .LBB19_6: @ %cond.load
+; CHECK-LE-NEXT:    vldr.16 s0, [r2]
+; CHECK-LE-NEXT:    lsls r3, r1, #30
+; CHECK-LE-NEXT:    bpl .LBB19_2
+; CHECK-LE-NEXT:  .LBB19_7: @ %cond.load1
+; CHECK-LE-NEXT:    vldr.16 s4, [r2, #2]
+; CHECK-LE-NEXT:    vins.f16 s0, s4
+; CHECK-LE-NEXT:    vmovx.f16 s4, s1
+; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:    lsls r3, r1, #29
+; CHECK-LE-NEXT:    bpl .LBB19_3
+; CHECK-LE-NEXT:  .LBB19_8: @ %cond.load4
+; CHECK-LE-NEXT:    vmovx.f16 s4, s0
+; CHECK-LE-NEXT:    vins.f16 s0, s4
+; CHECK-LE-NEXT:    vmovx.f16 s4, s1
+; CHECK-LE-NEXT:    vldr.16 s1, [r2, #4]
+; CHECK-LE-NEXT:    vins.f16 s1, s4
+; CHECK-LE-NEXT:    lsls r1, r1, #28
+; CHECK-LE-NEXT:    bmi .LBB19_4
+; CHECK-LE-NEXT:    b .LBB19_5
+;
+; CHECK-BE-LABEL: foo_v4f32_v4f16_unaligned:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    .save {r7, lr}
+; CHECK-BE-NEXT:    push {r7, lr}
+; CHECK-BE-NEXT:    .pad #8
+; CHECK-BE-NEXT:    sub sp, #8
+; CHECK-BE-NEXT:    vldrh.s32 q0, [r1]
+; CHECK-BE-NEXT:    vcmp.s32 gt, q0, zr
+; CHECK-BE-NEXT:    @ implicit-def: $q0
+; CHECK-BE-NEXT:    vmrs lr, p0
+; CHECK-BE-NEXT:    ubfx r1, lr, #12, #1
+; CHECK-BE-NEXT:    ubfx r3, lr, #8, #1
+; CHECK-BE-NEXT:    rsb.w r12, r1, #0
+; CHECK-BE-NEXT:    movs r1, #0
+; CHECK-BE-NEXT:    bfi r1, r12, #0, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, lr, #4, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    and r3, lr, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #3, #1
+; CHECK-BE-NEXT:    lsls r3, r1, #31
+; CHECK-BE-NEXT:    bne .LBB19_6
+; CHECK-BE-NEXT:  @ %bb.1: @ %else
+; CHECK-BE-NEXT:    lsls r3, r1, #30
+; CHECK-BE-NEXT:    bmi .LBB19_7
+; CHECK-BE-NEXT:  .LBB19_2: @ %else2
+; CHECK-BE-NEXT:    lsls r3, r1, #29
+; CHECK-BE-NEXT:    bmi .LBB19_8
+; CHECK-BE-NEXT:  .LBB19_3: @ %else5
+; CHECK-BE-NEXT:    lsls r1, r1, #28
+; CHECK-BE-NEXT:    bpl .LBB19_5
+; CHECK-BE-NEXT:  .LBB19_4: @ %cond.load7
+; CHECK-BE-NEXT:    vmovx.f16 s4, s0
+; CHECK-BE-NEXT:    vins.f16 s0, s4
+; CHECK-BE-NEXT:    vldr.16 s4, [r2, #6]
+; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:  .LBB19_5: @ %else8
+; CHECK-BE-NEXT:    vmrs r2, p0
+; CHECK-BE-NEXT:    movs r1, #0
+; CHECK-BE-NEXT:    vcvtt.f32.f16 s7, s1
+; CHECK-BE-NEXT:    vcvtb.f32.f16 s6, s1
+; CHECK-BE-NEXT:    vcvtt.f32.f16 s5, s0
+; CHECK-BE-NEXT:    vcvtb.f32.f16 s4, s0
+; CHECK-BE-NEXT:    ubfx r3, r2, #12, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
+; CHECK-BE-NEXT:    rsbs r3, r3, #0
+; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
+; CHECK-BE-NEXT:    rsbs r2, r2, #0
+; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
+; CHECK-BE-NEXT:    lsls r2, r1, #31
+; CHECK-BE-NEXT:    itt ne
+; CHECK-BE-NEXT:    vmovne r2, s4
+; CHECK-BE-NEXT:    strne r2, [r0]
+; CHECK-BE-NEXT:    lsls r2, r1, #30
+; CHECK-BE-NEXT:    itt mi
+; CHECK-BE-NEXT:    vmovmi r2, s5
+; CHECK-BE-NEXT:    strmi r2, [r0, #4]
+; CHECK-BE-NEXT:    lsls r2, r1, #29
+; CHECK-BE-NEXT:    itt mi
+; CHECK-BE-NEXT:    vmovmi r2, s6
+; CHECK-BE-NEXT:    strmi r2, [r0, #8]
+; CHECK-BE-NEXT:    lsls r1, r1, #28
+; CHECK-BE-NEXT:    itt mi
+; CHECK-BE-NEXT:    vmovmi r1, s7
+; CHECK-BE-NEXT:    strmi r1, [r0, #12]
+; CHECK-BE-NEXT:    add sp, #8
+; CHECK-BE-NEXT:    pop {r7, pc}
+; CHECK-BE-NEXT:  .LBB19_6: @ %cond.load
+; CHECK-BE-NEXT:    vldr.16 s0, [r2]
+; CHECK-BE-NEXT:    lsls r3, r1, #30
+; CHECK-BE-NEXT:    bpl .LBB19_2
+; CHECK-BE-NEXT:  .LBB19_7: @ %cond.load1
+; CHECK-BE-NEXT:    vldr.16 s4, [r2, #2]
+; CHECK-BE-NEXT:    vins.f16 s0, s4
+; CHECK-BE-NEXT:    vmovx.f16 s4, s1
+; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:    lsls r3, r1, #29
+; CHECK-BE-NEXT:    bpl .LBB19_3
+; CHECK-BE-NEXT:  .LBB19_8: @ %cond.load4
+; CHECK-BE-NEXT:    vmovx.f16 s4, s0
+; CHECK-BE-NEXT:    vins.f16 s0, s4
+; CHECK-BE-NEXT:    vmovx.f16 s4, s1
+; CHECK-BE-NEXT:    vldr.16 s1, [r2, #4]
+; CHECK-BE-NEXT:    vins.f16 s1, s4
+; CHECK-BE-NEXT:    lsls r1, r1, #28
+; CHECK-BE-NEXT:    bmi .LBB19_4
+; CHECK-BE-NEXT:    b .LBB19_5
 entry:
   %0 = load <4 x i16>, <4 x i16>* %mask, align 2
   %1 = icmp sgt <4 x i16> %0, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
index f77bcac8f27c..933424ad3ee9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
@@ -90,15 +90,15 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(<4 x i32> *%dest, <4
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
 ; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r1, r2, #1
+; CHECK-BE-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r1, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
@@ -241,15 +241,15 @@ define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align1_undef(<4 x i16> *%d
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r1, r2, #1
+; CHECK-BE-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r1, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
@@ -399,15 +399,15 @@ define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align1_undef(<4 x i16> *%d
 ; CHECK-BE-NEXT:    @ implicit-def: $q0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r1, r2, #1
+; CHECK-BE-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r1, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
@@ -631,27 +631,27 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(<8 x i16> *%dest, <8
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
 ; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r2, r1, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #14, #1
 ; CHECK-BE-NEXT:    rsbs r3, r2, #0
 ; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    and r1, r1, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
 ; CHECK-BE-NEXT:    rsbs r1, r1, #0
@@ -1278,15 +1278,15 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(<4 x float> *%dest
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
 ; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r1, r2, #1
+; CHECK-BE-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r1, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
@@ -1561,27 +1561,27 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest,
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    @ implicit-def: $q1
 ; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r2, r1, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #14, #1
 ; CHECK-BE-NEXT:    rsbs r3, r2, #0
 ; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    and r1, r1, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
 ; CHECK-BE-NEXT:    rsbs r1, r1, #0
@@ -2015,17 +2015,17 @@ define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16_align1(<4 x i16> *%dest, <4 x i32
 ; CHECK-BE-NEXT:    mov.w r12, #0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    vmrs r3, p0
-; CHECK-BE-NEXT:    and r1, r3, #1
+; CHECK-BE-NEXT:    ubfx r1, r3, #12, #1
 ; CHECK-BE-NEXT:    rsbs r2, r1, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r2, #0, #1
-; CHECK-BE-NEXT:    ubfx r2, r3, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r3, #8, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r1, r2, #1, #1
-; CHECK-BE-NEXT:    ubfx r2, r3, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r3, #4, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r1, r2, #2, #1
-; CHECK-BE-NEXT:    ubfx r2, r3, #12, #1
+; CHECK-BE-NEXT:    and r2, r3, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r1, r2, #3, #1
 ; CHECK-BE-NEXT:    lsls r2, r1, #31

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
index fca6c8048673..a2cc365fb6b6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
@@ -67,15 +67,15 @@ define arm_aapcs_vfpcc void @masked_v4i32_align1(<4 x i32> *%dest, <4 x i32> %a)
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r1, r2, #1
+; CHECK-BE-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r1, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
@@ -257,27 +257,27 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a)
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s16 gt, q1, zr
 ; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r2, r1, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #14, #1
 ; CHECK-BE-NEXT:    rsbs r3, r2, #0
 ; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    and r1, r1, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
 ; CHECK-BE-NEXT:    rsbs r1, r1, #0
@@ -528,14 +528,14 @@ define arm_aapcs_vfpcc void @masked_v4f32_align1(<4 x float> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r3, r2, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
@@ -763,27 +763,27 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r2, r1, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #14, #1
 ; CHECK-BE-NEXT:    rsbs r3, r2, #0
 ; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #6, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #4, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT:    and r1, r1, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r2, r3, #6, #1
 ; CHECK-BE-NEXT:    rsbs r1, r1, #0
@@ -1180,15 +1180,15 @@ define arm_aapcs_vfpcc void @masked_v4i16_align1(<4 x i16> *%dest, <4 x i32> %a)
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vcmp.s32 gt, q1, zr
 ; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r1, r2, #1
+; CHECK-BE-NEXT:    ubfx r1, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r1, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
@@ -1307,17 +1307,17 @@ define arm_aapcs_vfpcc void @masked_v4f16_align4(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    movs r1, #0
-; CHECK-BE-NEXT:    vcmp.f32 s4, #0
+; CHECK-BE-NEXT:    vcmp.f32 s7, #0
 ; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-BE-NEXT:    it gt
 ; CHECK-BE-NEXT:    movgt r1, #1
 ; CHECK-BE-NEXT:    cmp r1, #0
-; CHECK-BE-NEXT:    vcmp.f32 s5, #0
+; CHECK-BE-NEXT:    vcmp.f32 s6, #0
 ; CHECK-BE-NEXT:    cset r1, ne
 ; CHECK-BE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-BE-NEXT:    and r1, r1, #1
-; CHECK-BE-NEXT:    vcmp.f32 s6, #0
+; CHECK-BE-NEXT:    vcmp.f32 s5, #0
 ; CHECK-BE-NEXT:    rsb.w r3, r1, #0
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
@@ -1328,7 +1328,7 @@ define arm_aapcs_vfpcc void @masked_v4f16_align4(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    cset r3, ne
 ; CHECK-BE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-BE-NEXT:    and r3, r3, #1
-; CHECK-BE-NEXT:    vcmp.f32 s7, #0
+; CHECK-BE-NEXT:    vcmp.f32 s4, #0
 ; CHECK-BE-NEXT:    rsb.w r3, r3, #0
 ; CHECK-BE-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
@@ -1479,17 +1479,17 @@ define arm_aapcs_vfpcc void @masked_v4f16_align2(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    sub sp, #4
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    movs r1, #0
-; CHECK-BE-NEXT:    vcmp.f32 s4, #0
+; CHECK-BE-NEXT:    vcmp.f32 s7, #0
 ; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-BE-NEXT:    it gt
 ; CHECK-BE-NEXT:    movgt r1, #1
 ; CHECK-BE-NEXT:    cmp r1, #0
-; CHECK-BE-NEXT:    vcmp.f32 s5, #0
+; CHECK-BE-NEXT:    vcmp.f32 s6, #0
 ; CHECK-BE-NEXT:    cset r1, ne
 ; CHECK-BE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-BE-NEXT:    and r1, r1, #1
-; CHECK-BE-NEXT:    vcmp.f32 s6, #0
+; CHECK-BE-NEXT:    vcmp.f32 s5, #0
 ; CHECK-BE-NEXT:    rsb.w r3, r1, #0
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
@@ -1500,7 +1500,7 @@ define arm_aapcs_vfpcc void @masked_v4f16_align2(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    cset r3, ne
 ; CHECK-BE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-BE-NEXT:    and r3, r3, #1
-; CHECK-BE-NEXT:    vcmp.f32 s7, #0
+; CHECK-BE-NEXT:    vcmp.f32 s4, #0
 ; CHECK-BE-NEXT:    rsb.w r3, r3, #0
 ; CHECK-BE-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
@@ -1659,17 +1659,17 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    sub sp, #20
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    movs r1, #0
-; CHECK-BE-NEXT:    vcmp.f32 s4, #0
+; CHECK-BE-NEXT:    vcmp.f32 s7, #0
 ; CHECK-BE-NEXT:    movs r2, #0
 ; CHECK-BE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-BE-NEXT:    it gt
 ; CHECK-BE-NEXT:    movgt r1, #1
 ; CHECK-BE-NEXT:    cmp r1, #0
-; CHECK-BE-NEXT:    vcmp.f32 s5, #0
+; CHECK-BE-NEXT:    vcmp.f32 s6, #0
 ; CHECK-BE-NEXT:    cset r1, ne
 ; CHECK-BE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-BE-NEXT:    and r1, r1, #1
-; CHECK-BE-NEXT:    vcmp.f32 s6, #0
+; CHECK-BE-NEXT:    vcmp.f32 s5, #0
 ; CHECK-BE-NEXT:    rsb.w r3, r1, #0
 ; CHECK-BE-NEXT:    mov.w r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
@@ -1680,7 +1680,7 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(<4 x half> *%dest, <4 x float>
 ; CHECK-BE-NEXT:    cset r3, ne
 ; CHECK-BE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-BE-NEXT:    and r3, r3, #1
-; CHECK-BE-NEXT:    vcmp.f32 s7, #0
+; CHECK-BE-NEXT:    vcmp.f32 s4, #0
 ; CHECK-BE-NEXT:    rsb.w r3, r3, #0
 ; CHECK-BE-NEXT:    vcvtb.f16.f32 s0, s4
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
index 5dfd5a7074e5..5f7f14fbd23e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
@@ -28,9 +28,10 @@ define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) {
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    .pad #4
 ; CHECK-BE-NEXT:    sub sp, #4
-; CHECK-BE-NEXT:    and r0, r0, #15
+; CHECK-BE-NEXT:    rbit r0, r0
 ; CHECK-BE-NEXT:    vmov.i8 q1, #0x0
 ; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-BE-NEXT:    lsrs r0, r0, #28
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q1, q2, q1
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[2]
@@ -90,7 +91,9 @@ define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) {
 ; CHECK-BE-NEXT:    sub sp, #8
 ; CHECK-BE-NEXT:    uxtb r0, r0
 ; CHECK-BE-NEXT:    vmov.i8 q1, #0x0
+; CHECK-BE-NEXT:    rbit r0, r0
 ; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-BE-NEXT:    lsrs r0, r0, #24
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q2, q2, q1
 ; CHECK-BE-NEXT:    vmov.u8 r0, q2[0]
@@ -153,10 +156,13 @@ define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) {
 ; CHECK-BE-NEXT:    mov r4, sp
 ; CHECK-BE-NEXT:    bfc r4, #0, #4
 ; CHECK-BE-NEXT:    mov sp, r4
+; CHECK-BE-NEXT:    uxth r0, r0
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
+; CHECK-BE-NEXT:    rbit r0, r0
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-BE-NEXT:    sub.w r4, r7, #8
 ; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    lsrs r0, r0, #16
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
@@ -236,15 +242,15 @@ define arm_aapcs_vfpcc i4 @bitcast_from_v4i1(<4 x i32> %a) {
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vcmp.i32 eq, q1, zr
 ; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r0, r1, #1
+; CHECK-BE-NEXT:    ubfx r0, r1, #12, #1
 ; CHECK-BE-NEXT:    rsbs r2, r0, #0
 ; CHECK-BE-NEXT:    movs r0, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #0, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #8, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #1, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #8, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-BE-NEXT:    and r1, r1, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #2, #1
 ; CHECK-BE-NEXT:    rsbs r1, r1, #0
@@ -300,27 +306,27 @@ define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) {
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r0, r1, #1
+; CHECK-BE-NEXT:    ubfx r0, r1, #14, #1
 ; CHECK-BE-NEXT:    rsbs r2, r0, #0
 ; CHECK-BE-NEXT:    movs r0, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #0, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #2, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #12, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #1, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #10, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #2, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #6, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #8, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #3, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #6, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #4, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #10, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #5, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #12, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #2, #1
+; CHECK-BE-NEXT:    and r1, r1, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r0, r2, #6, #1
 ; CHECK-BE-NEXT:    rsbs r1, r1, #0
@@ -368,7 +374,8 @@ define arm_aapcs_vfpcc i16 @bitcast_from_v16i1(<16 x i8> %a) {
 ; CHECK-BE-NEXT:    sub.w r4, r7, #8
 ; CHECK-BE-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-BE-NEXT:    vmrs r0, p0
-; CHECK-BE-NEXT:    uxth r0, r0
+; CHECK-BE-NEXT:    rbit r0, r0
+; CHECK-BE-NEXT:    lsrs r0, r0, #16
 ; CHECK-BE-NEXT:    mov sp, r4
 ; CHECK-BE-NEXT:    pop {r4, r6, r7, pc}
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
index a68250596711..cfecd4c5f14e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
@@ -26,6 +26,8 @@ define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) {
 ; CHECK-BE-NEXT:    ldrb r0, [r0]
 ; CHECK-BE-NEXT:    vmov.i8 q1, #0x0
 ; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-BE-NEXT:    rbit r0, r0
+; CHECK-BE-NEXT:    lsrs r0, r0, #28
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q1, q2, q1
 ; CHECK-BE-NEXT:    vmov.u8 r0, q1[2]
@@ -80,6 +82,8 @@ define arm_aapcs_vfpcc <8 x i16> @load_v8i1(<8 x i1> *%src, <8 x i16> %a) {
 ; CHECK-BE-NEXT:    ldrb r0, [r0]
 ; CHECK-BE-NEXT:    vmov.i8 q1, #0x0
 ; CHECK-BE-NEXT:    vmov.i8 q2, #0xff
+; CHECK-BE-NEXT:    rbit r0, r0
+; CHECK-BE-NEXT:    lsrs r0, r0, #24
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q2, q2, q1
 ; CHECK-BE-NEXT:    vmov.u8 r0, q2[0]
@@ -125,7 +129,9 @@ define arm_aapcs_vfpcc <16 x i8> @load_v16i1(<16 x i1> *%src, <16 x i8> %a) {
 ; CHECK-BE-NEXT:    ldrh r0, [r0]
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x0
+; CHECK-BE-NEXT:    rbit r0, r0
 ; CHECK-BE-NEXT:    vrev32.8 q0, q0
+; CHECK-BE-NEXT:    lsrs r0, r0, #16
 ; CHECK-BE-NEXT:    vmsr p0, r0
 ; CHECK-BE-NEXT:    vpsel q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
@@ -195,14 +201,14 @@ define arm_aapcs_vfpcc void @store_v4i1(<4 x i1> *%dst, <4 x i32> %a) {
 ; CHECK-BE-NEXT:    movs r3, #0
 ; CHECK-BE-NEXT:    vcmp.i32 eq, q1, zr
 ; CHECK-BE-NEXT:    vmrs r1, p0
-; CHECK-BE-NEXT:    and r2, r1, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #12, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r3, r2, #0, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #8, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r3, r2, #1, #1
-; CHECK-BE-NEXT:    ubfx r2, r1, #8, #1
-; CHECK-BE-NEXT:    ubfx r1, r1, #12, #1
+; CHECK-BE-NEXT:    ubfx r2, r1, #4, #1
+; CHECK-BE-NEXT:    and r1, r1, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
 ; CHECK-BE-NEXT:    bfi r3, r2, #2, #1
 ; CHECK-BE-NEXT:    rsbs r1, r1, #0
@@ -253,27 +259,27 @@ define arm_aapcs_vfpcc void @store_v8i1(<8 x i1> *%dst, <8 x i16> %a) {
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
 ; CHECK-BE-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-BE-NEXT:    vmrs r2, p0
-; CHECK-BE-NEXT:    and r1, r2, #1
+; CHECK-BE-NEXT:    ubfx r1, r2, #14, #1
 ; CHECK-BE-NEXT:    rsbs r3, r1, #0
 ; CHECK-BE-NEXT:    movs r1, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #0, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #2, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #12, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #1, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #10, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #2, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #6, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #3, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #6, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #4, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #10, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #4, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #5, #1
-; CHECK-BE-NEXT:    ubfx r3, r2, #12, #1
-; CHECK-BE-NEXT:    ubfx r2, r2, #14, #1
+; CHECK-BE-NEXT:    ubfx r3, r2, #2, #1
+; CHECK-BE-NEXT:    and r2, r2, #1
 ; CHECK-BE-NEXT:    rsbs r3, r3, #0
 ; CHECK-BE-NEXT:    bfi r1, r3, #6, #1
 ; CHECK-BE-NEXT:    rsbs r2, r2, #0
@@ -299,6 +305,8 @@ define arm_aapcs_vfpcc void @store_v16i1(<16 x i1> *%dst, <16 x i8> %a) {
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
 ; CHECK-BE-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-BE-NEXT:    vmrs r1, p0
+; CHECK-BE-NEXT:    rbit r1, r1
+; CHECK-BE-NEXT:    lsrs r1, r1, #16
 ; CHECK-BE-NEXT:    strh r1, [r0]
 ; CHECK-BE-NEXT:    bx lr
 entry:


        


More information about the llvm-commits mailing list