[llvm] fc36fb4 - Revert "Second Recommit "[AArch64] Split bitmask immediate of bitwise AND operation""

Wed Oct 6 01:39:54 PDT 2021

Author: David Spickett
Date: 2021-10-06T08:39:48Z
New Revision: fc36fb4d23a5e419cf33002c87c0082f682cb77b

URL: https://github.com/llvm/llvm-project/commit/fc36fb4d23a5e419cf33002c87c0082f682cb77b
DIFF: https://github.com/llvm/llvm-project/commit/fc36fb4d23a5e419cf33002c87c0082f682cb77b.diff

LOG: Revert "Second Recommit "[AArch64] Split bitmask immediate of bitwise AND operation""

This reverts commit 13f3c39f3658fa28cb008eb56a58d8e34697cd5d.

The revert is due to test failures in the stage 2 clang tests on the AArch64 bots.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64.h
    llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
    llvm/lib/Target/AArch64/CMakeLists.txt
    llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
    llvm/test/CodeGen/AArch64/O3-pipeline.ll
    llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll

Removed: 
    llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
    llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index b0dd30c13137f..658d44771e8d6 100644

--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -51,7 +51,6 @@ FunctionPass *createAArch64A53Fix835769();
 FunctionPass *createFalkorHWPFFixPass();
 FunctionPass *createFalkorMarkStridedAccessesPass();
 FunctionPass *createAArch64BranchTargetsPass();
-FunctionPass *createAArch64MIPeepholeOptPass();
 
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 
@@ -83,7 +82,6 @@ void initializeAArch64SLSHardeningPass(PassRegistry&);
 void initializeAArch64SpeculationHardeningPass(PassRegistry&);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
 void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
-void initializeAArch64MIPeepholeOptPass(PassRegistry &);
 void initializeAArch64SIMDInstrOptPass(PassRegistry&);
 void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
 void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);

diff  --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
deleted file mode 100644
index d5c7791ca02ec..0000000000000
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs below peephole optimizations on MIR level.
-//
-// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
-//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
-//
-//    The mov pseudo instruction could be expanded to multiple mov instructions
-//    later. In this case, we could try to split the constant  operand of mov
-//    instruction into two bitmask immediates. It makes two AND instructions
-//    intead of multiple `mov` + `and` instructions.
-//===----------------------------------------------------------------------===//
-
-#include "AArch64ExpandImm.h"
-#include "AArch64InstrInfo.h"
-#include "MCTargetDesc/AArch64AddressingModes.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "aarch64-mi-peephole-opt"
-
-namespace {
-
-struct AArch64MIPeepholeOpt : public MachineFunctionPass {
-  static char ID;
-
-  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
-    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
-  }
-
-  const AArch64InstrInfo *TII;
-  MachineLoopInfo *MLI;
-  MachineRegisterInfo *MRI;
-
-  template <typename T>
-  bool visitAND(MachineInstr &MI,
-                SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override {
-    return "AArch64 MI Peephole Optimization pass";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    AU.addRequired<MachineLoopInfo>();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-char AArch64MIPeepholeOpt::ID = 0;
-
-} // end anonymous namespace
-
-INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
-                "AArch64 MI Peephole Optimization", false, false)
-
-template <typename T>
-static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
-  T UImm = static_cast<T>(Imm);
-  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
-    return false;
-
-  // If this immediate can be handled by one instruction, do not split it.
-  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
-  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
-  if (Insn.size() == 1)
-    return false;
-
-  // The bitmask immediate consists of consecutive ones.  Let's say there is
-  // constant 0b00000000001000000000010000000000 which does not consist of
-  // consecutive ones. We can split it in to two bitmask immediate like
-  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
-  // If we do AND with these two bitmask immediate, we can see original one.
-  unsigned LowestBitSet = countTrailingZeros(UImm);
-  unsigned HighestBitSet = Log2_64(UImm);
-
-  // Create a mask which is filled with one from the position of lowest bit set
-  // to the position of highest bit set.
-  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
-              (static_cast<T>(1) << LowestBitSet);
-  // Create a mask which is filled with one outside the position of lowest bit
-  // set and the position of highest bit set.
-  T NewImm2 = UImm | ~NewImm1;
-
-  // If the split value is not valid bitmask immediate, do not split this
-  // constant.
-  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
-    return false;
-
-  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
-  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
-  return true;
-}
-
-template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(
-    MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
-  // Try below transformation.
-  //
-  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
-  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
-  //
-  // The mov pseudo instruction could be expanded to multiple mov instructions
-  // later. Let's try to split the constant operand of mov instruction into two
-  // bitmask immediates. It makes only two AND instructions intead of multiple
-  // mov + and instructions.
-
-  unsigned RegSize = sizeof(T) * 8;
-  assert((RegSize == 32 || RegSize == 64) &&
-         "Invalid RegSize for AND bitmask peephole optimization");
-
-  // Check whether AND's MBB is in loop and the AND is loop invariant.
-  MachineBasicBlock *MBB = MI.getParent();
-  MachineLoop *L = MLI->getLoopFor(MBB);
-  if (L && !L->isLoopInvariant(MI))
-    return false;
-
-  // Check whether AND's operand is MOV with immediate.
-  MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
-  MachineInstr *SubregToRegMI = nullptr;
-  // If it is SUBREG_TO_REG, check its operand.
-  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
-    SubregToRegMI = MovMI;
-    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
-  }
-
-  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
-      MovMI->getOpcode() != AArch64::MOVi64imm)
-    return false;
-
-  // If the MOV has multiple uses, do not split the immediate because it causes
-  // more instructions.
-  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
-    return false;
-
-  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
-    return false;
-
-  // Split the bitmask immediate into two.
-  T UImm = static_cast<T>(MovMI->getOperand(1).getImm());
-  T Imm1Enc;
-  T Imm2Enc;
-  if (!splitBitmaskImm(UImm, RegSize, Imm1Enc, Imm2Enc))
-    return false;
-
-  // Create new AND MIs.
-  DebugLoc DL = MI.getDebugLoc();
-  const TargetRegisterClass *ANDImmRC =
-      (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
-  Register DstReg = MI.getOperand(0).getReg();
-  Register SrcReg = MI.getOperand(1).getReg();
-  Register NewTmpReg = MRI->createVirtualRegister(ANDImmRC);
-  unsigned Opcode = (RegSize == 32) ? AArch64::ANDWri : AArch64::ANDXri;
-
-  MRI->constrainRegClass(NewTmpReg, MRI->getRegClass(SrcReg));
-  BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
-      .addReg(SrcReg)
-      .addImm(Imm1Enc);
-
-  MRI->constrainRegClass(DstReg, ANDImmRC);
-  BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
-      .addReg(NewTmpReg)
-      .addImm(Imm2Enc);
-
-  ToBeRemoved.insert(&MI);
-  if (SubregToRegMI)
-    ToBeRemoved.insert(SubregToRegMI);
-  ToBeRemoved.insert(MovMI);
-
-  return true;
-}
-
-bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(MF.getFunction()))
-    return false;
-
-  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
-  MLI = &getAnalysis<MachineLoopInfo>();
-  MRI = &MF.getRegInfo();
-
-  if (!MRI->isSSA())
-    return false;
-
-  bool Changed = false;
-  SmallSetVector<MachineInstr *, 8> ToBeRemoved;
-
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
-      switch (MI.getOpcode()) {
-      default:
-        break;
-      case AArch64::ANDWrr:
-        Changed = visitAND<uint32_t>(MI, ToBeRemoved);
-        break;
-      case AArch64::ANDXrr:
-        Changed = visitAND<uint64_t>(MI, ToBeRemoved);
-        break;
-      }
-    }
-  }
-
-  for (MachineInstr *MI : ToBeRemoved)
-    MI->eraseFromParent();
-
-  return Changed;
-}
-
-FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
-  return new AArch64MIPeepholeOpt();
-}

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 5d26b6d41b4c5..6127f890118f9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -195,7 +195,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
   initializeAArch64DeadRegisterDefinitionsPass(*PR);
   initializeAArch64ExpandPseudoPass(*PR);
   initializeAArch64LoadStoreOptPass(*PR);
-  initializeAArch64MIPeepholeOptPass(*PR);
   initializeAArch64SIMDInstrOptPass(*PR);
   initializeAArch64O0PreLegalizerCombinerPass(*PR);
   initializeAArch64PreLegalizerCombinerPass(*PR);
@@ -481,7 +480,6 @@ class AArch64PassConfig : public TargetPassConfig {
   bool addRegBankSelect() override;
   void addPreGlobalInstructionSelect() override;
   bool addGlobalInstructionSelect() override;
-  void addMachineSSAOptimization() override;
   bool addILPOpts() override;
   void addPreRegAlloc() override;
   void addPostRegAlloc() override;
@@ -658,14 +656,6 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
   return false;
 }
 
-void AArch64PassConfig::addMachineSSAOptimization() {
-  // Run default MachineSSAOptimization first.
-  TargetPassConfig::addMachineSSAOptimization();
-
-  if (TM->getOptLevel() != CodeGenOpt::None)
-    addPass(createAArch64MIPeepholeOptPass());
-}
-
 bool AArch64PassConfig::addILPOpts() {
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());

diff  --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index aeedeb4eebac8..a77a66bacc4c2 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -66,7 +66,6 @@ add_llvm_target(AArch64CodeGen
   AArch64LowerHomogeneousPrologEpilog.cpp
   AArch64MachineFunctionInfo.cpp
   AArch64MacroFusion.cpp
-  AArch64MIPeepholeOpt.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp

diff  --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 8765260935910..c3e74757675b6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
 
-#include "AArch64ExpandImm.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/bit.h"

diff  --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 94c61d66a20d4..95816bd9d3262 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -40,7 +40,7 @@
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:       Merge contiguous icmps into a memcmp
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
@@ -132,7 +132,6 @@
 ; CHECK-NEXT:       Machine code sinking
 ; CHECK-NEXT:       Peephole Optimizations
 ; CHECK-NEXT:       Remove dead machine instructions
-; CHECK-NEXT:       AArch64 MI Peephole Optimization pass
 ; CHECK-NEXT:       AArch64 Dead register definitions
 ; CHECK-NEXT:       Detect Dead Lanes
 ; CHECK-NEXT:       Process Implicit Definitions

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
deleted file mode 100644
index 9f6d9f88e73e4..0000000000000
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ /dev/null
@@ -1,245 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-
-define i8 @test1(i32 %a) {
-; CHECK-LABEL: test1:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0x3ffc00
-; CHECK-NEXT:    and w8, w8, #0xffe007ff
-; CHECK-NEXT:    cmp w8, #1024
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-entry:
-  %and = and i32 %a, 2098176
-  %cmp = icmp eq i32 %and, 1024
-  %conv = zext i1 %cmp to i8
-  ret i8 %conv
-}
-
-; This constant should not be split because it can be handled by one mov.
-define i8 @test2(i32 %a) {
-; CHECK-LABEL: test2:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #135
-; CHECK-NEXT:    and w8, w0, w8
-; CHECK-NEXT:    cmp w8, #1024
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-entry:
-  %and = and i32 %a, 135
-  %cmp = icmp eq i32 %and, 1024
-  %conv = zext i1 %cmp to i8
-  ret i8 %conv
-}
-
-; This constant should not be split because the split immediate is not valid
-; bitmask immediate.
-define i8 @test3(i32 %a) {
-; CHECK-LABEL: test3:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #1024
-; CHECK-NEXT:    movk w8, #33, lsl #16
-; CHECK-NEXT:    and w8, w0, w8
-; CHECK-NEXT:    cmp w8, #1024
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-entry:
-  %and = and i32 %a, 2163712
-  %cmp = icmp eq i32 %and, 1024
-  %conv = zext i1 %cmp to i8
-  ret i8 %conv
-}
-
-define i8 @test4(i64 %a) {
-; CHECK-LABEL: test4:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and x8, x0, #0x3ffc00
-; CHECK-NEXT:    and x8, x8, #0xffffffffffe007ff
-; CHECK-NEXT:    cmp x8, #1024
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-entry:
-  %and = and i64 %a, 2098176
-  %cmp = icmp eq i64 %and, 1024
-  %conv = zext i1 %cmp to i8
-  ret i8 %conv
-}
-
-define i8 @test5(i64 %a) {
-; CHECK-LABEL: test5:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and x8, x0, #0x3ffffc000
-; CHECK-NEXT:    and x8, x8, #0xfffffffe00007fff
-; CHECK-NEXT:    cmp x8, #1024
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-entry:
-  %and = and i64 %a, 8589950976
-  %cmp = icmp eq i64 %and, 1024
-  %conv = zext i1 %cmp to i8
-  ret i8 %conv
-}
-
-; This constant should not be split because it can be handled by one mov.
-define i8 @test6(i64 %a) {
-; CHECK-LABEL: test6:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #135
-; CHECK-NEXT:    and x8, x0, x8
-; CHECK-NEXT:    cmp x8, #1024
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-entry:
-  %and = and i64 %a, 135
-  %cmp = icmp eq i64 %and, 1024
-  %conv = zext i1 %cmp to i8
-  ret i8 %conv
-}
-
-; This constant should not be split because the split immediate is not valid
-; bitmask immediate.
-define i8 @test7(i64 %a) {
-; CHECK-LABEL: test7:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #1024
-; CHECK-NEXT:    movk w8, #33, lsl #16
-; CHECK-NEXT:    and x8, x0, x8
-; CHECK-NEXT:    cmp x8, #1024
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-entry:
-  %and = and i64 %a, 2163712
-  %cmp = icmp eq i64 %and, 1024
-  %conv = zext i1 %cmp to i8
-  ret i8 %conv
-}
-
-; The split bitmask immediates should be hoisted outside loop because they are
-; loop invariant.
-define void @test8(i64 %a, i64* noalias %src, i64* noalias %dst, i64 %n) {
-; CHECK-LABEL: test8:
-; CHECK:       // %bb.0: // %loop.ph
-; CHECK-NEXT:    and x9, x0, #0x3ffc00
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    and x9, x9, #0xffffffffffe007ff
-; CHECK-NEXT:    b .LBB7_2
-; CHECK-NEXT:  .LBB7_1: // %for.inc
-; CHECK-NEXT:    // in Loop: Header=BB7_2 Depth=1
-; CHECK-NEXT:    add x8, x8, #1
-; CHECK-NEXT:    cmp x8, x3
-; CHECK-NEXT:    b.gt .LBB7_4
-; CHECK-NEXT:  .LBB7_2: // %loop
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.hs .LBB7_1
-; CHECK-NEXT:  // %bb.3: // %if.then
-; CHECK-NEXT:    // in Loop: Header=BB7_2 Depth=1
-; CHECK-NEXT:    lsl x10, x8, #3
-; CHECK-NEXT:    ldr x11, [x1, x10]
-; CHECK-NEXT:    str x11, [x2, x10]
-; CHECK-NEXT:    b .LBB7_1
-; CHECK-NEXT:  .LBB7_4: // %exit
-; CHECK-NEXT:    ret
-loop.ph:
-  br label %loop
-
-loop:
-  %iv = phi i64 [ %inc, %for.inc ], [ 0, %loop.ph ]
-  %and = and i64 %a, 2098176
-  %cmp = icmp ult i64 %iv, %and
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:
-  %src.arrayidx = getelementptr inbounds i64, i64* %src, i64 %iv
-  %val = load i64, i64* %src.arrayidx
-  %dst.arrayidx = getelementptr inbounds i64, i64* %dst, i64 %iv
-  store i64 %val, i64* %dst.arrayidx
-  br label %for.inc
-
-if.else:
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i64 %iv, 1
-  %cond = icmp sgt i64 %inc, %n
-  br i1 %cond, label %exit, label %loop
-
-exit:
-  ret void
-}
-
-; This constant should not be split because the `and` is not loop invariant.
-define i32 @test9(i32* nocapture %x, i32* nocapture readonly %y, i32 %n) {
-; CHECK-LABEL: test9:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w2, #1
-; CHECK-NEXT:    b.lt .LBB8_3
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w9, #1024
-; CHECK-NEXT:    mov w8, w2
-; CHECK-NEXT:    movk w9, #32, lsl #16
-; CHECK-NEXT:  .LBB8_2: // %for.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr w10, [x1], #4
-; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    and w10, w10, w9
-; CHECK-NEXT:    str w10, [x0], #4
-; CHECK-NEXT:    b.ne .LBB8_2
-; CHECK-NEXT:  .LBB8_3: // %for.cond.cleanup
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-entry:
-  %cmp8 = icmp sgt i32 %n, 0
-  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %n to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret i32 0
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %and = and i32 %0, 2098176
-  %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
-  store i32 %and, i32* %arrayidx2, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; After instruction selection end, we can see the `and` and `or` share the
-; constant as below.
-;
-; %4:gpr32 = MOVi32imm 2098176
-; %5:gpr32 = ANDWrr killed %3:gpr32, %4:gpr32
-; STRWui killed %5:gpr32, %0:gpr64common, 0 :: (store (s32) into %ir.x, !tbaa !8)
-; %6:gpr32 = LDRWui %1:gpr64common, 0 :: (load (s32) from %ir.y, !tbaa !8)
-; %7:gpr32 = ORRWrr killed %6:gpr32, %4:gpr32
-;
-; In this case, the constant should not be split because it causes more
-; instructions.
-define void @test10(i32* nocapture %x, i32* nocapture readonly %y, i32* nocapture %z) {
-; CHECK-LABEL: test10:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr w8, [x1]
-; CHECK-NEXT:    mov w9, #1024
-; CHECK-NEXT:    movk w9, #32, lsl #16
-; CHECK-NEXT:    and w8, w8, w9
-; CHECK-NEXT:    str w8, [x0]
-; CHECK-NEXT:    ldr w8, [x1]
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    str w8, [x2]
-; CHECK-NEXT:    ret
-entry:
-  %0 = load i32, i32* %y, align 4
-  %and = and i32 %0, 2098176
-  store i32 %and, i32* %x, align 4
-  %1 = load i32, i32* %y, align 4
-  %or = or i32 %1, 2098176
-  store i32 %or, i32* %z, align 4
-  ret void
-}

diff  --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
index 3e30f45cfabb3..2e20ef67b2a2d 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -245,9 +245,10 @@ define i32 @in_multiuse_B_constmask(i32 %x, i32 %y, i32 %z) nounwind {
 define i32 @n0_badconstmask(i32 %x, i32 %y) {
 ; CHECK-LABEL: n0_badconstmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w1, #0xffffff00
+; CHECK-NEXT:    mov w9, #256
+; CHECK-NEXT:    movk w9, #65280, lsl #16
 ; CHECK-NEXT:    and w8, w0, #0xffff00
-; CHECK-NEXT:    and w9, w9, #0xff0001ff
+; CHECK-NEXT:    and w9, w1, w9
 ; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %mx = and i32 %x, 16776960