[llvm] 30caca3 - Third Recommit "[AArch64] Split bitmask immediate of bitwise AND operation"
Jingu Kang via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 8 03:29:46 PDT 2021
Author: Jingu Kang
Date: 2021-10-08T11:28:49+01:00
New Revision: 30caca39f401ae17927439c0a0bd6d1b1916dd6a
URL: https://github.com/llvm/llvm-project/commit/30caca39f401ae17927439c0a0bd6d1b1916dd6a
DIFF: https://github.com/llvm/llvm-project/commit/30caca39f401ae17927439c0a0bd6d1b1916dd6a.diff
LOG: Third Recommit "[AArch64] Split bitmask immediate of bitwise AND operation"
This reverts the revert commit fc36fb4d23a5e419cf33002c87c0082f682cb77b with
bug fixes.
Differential Revision: https://reviews.llvm.org/D109963
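As a worked illustration (using the constant exercised by the new tests): 2098176
is 0x200400, i.e. 0b00000000001000000000010000000000. Its set bits are not
consecutive, so it is not a valid AArch64 logical immediate and would otherwise
be materialized with a mov/movk pair. Splitting yields 0x3ffc00 (ones from the
lowest set bit through the highest set bit) and 0xffe007ff (ones everywhere
outside that range); since 0x3ffc00 & 0xffe007ff == 0x200400, the original AND
becomes two ANDs with valid bitmask immediates.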
Added:
llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
Modified:
llvm/lib/Target/AArch64/AArch64.h
llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
llvm/lib/Target/AArch64/CMakeLists.txt
llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
llvm/test/CodeGen/AArch64/O3-pipeline.ll
llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 658d44771e8d6..b0dd30c13137f 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -51,6 +51,7 @@ FunctionPass *createAArch64A53Fix835769();
FunctionPass *createFalkorHWPFFixPass();
FunctionPass *createFalkorMarkStridedAccessesPass();
FunctionPass *createAArch64BranchTargetsPass();
+FunctionPass *createAArch64MIPeepholeOptPass();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
@@ -82,6 +83,7 @@ void initializeAArch64SLSHardeningPass(PassRegistry&);
void initializeAArch64SpeculationHardeningPass(PassRegistry&);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
+void initializeAArch64MIPeepholeOptPass(PassRegistry &);
void initializeAArch64SIMDInstrOptPass(PassRegistry&);
void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
new file mode 100644
index 0000000000000..d091c8fd6a033
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -0,0 +1,225 @@
+//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs the below peephole optimizations at the MIR level.
+//
+// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
+// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+//
+// The mov pseudo instruction could be expanded to multiple mov instructions
+// later. In this case, we could try to split the constant operand of the mov
+// instruction into two bitmask immediates. It makes two AND instructions
+// instead of multiple `mov` + `and` instructions.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ExpandImm.h"
+#include "AArch64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-mi-peephole-opt"
+
+namespace {
+
+struct AArch64MIPeepholeOpt : public MachineFunctionPass {
+ static char ID;
+
+ AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
+ initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
+ }
+
+ const AArch64InstrInfo *TII;
+ MachineLoopInfo *MLI;
+ MachineRegisterInfo *MRI;
+
+ template <typename T>
+ bool visitAND(MachineInstr &MI,
+ SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AArch64 MI Peephole Optimization pass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+char AArch64MIPeepholeOpt::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
+ "AArch64 MI Peephole Optimization", false, false)
+
+template <typename T>
+static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
+ T UImm = static_cast<T>(Imm);
+ if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
+ return false;
+
+ // If this immediate can be handled by one instruction, do not split it.
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
+ if (Insn.size() == 1)
+ return false;
+
+ // A bitmask immediate consists of consecutive ones. Let's say there is a
+ // constant 0b00000000001000000000010000000000 which does not consist of
+ // consecutive ones. We can split it into two bitmask immediates like
+ // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
+ // If we AND these two bitmask immediates, we get the original constant.
+ unsigned LowestBitSet = countTrailingZeros(UImm);
+ unsigned HighestBitSet = Log2_64(UImm);
+
+ // Create a mask which is filled with ones from the position of the lowest
+ // set bit to the position of the highest set bit.
+ T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
+ (static_cast<T>(1) << LowestBitSet);
+ // Create a mask which is filled with ones outside the range between the
+ // lowest set bit and the highest set bit.
+ T NewImm2 = UImm | ~NewImm1;
+
+ // If the split value is not a valid bitmask immediate, do not split this
+ // constant.
+ if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
+ return false;
+
+ Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+ Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+ return true;
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitAND(
+ MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+ // Try the below transformation.
+ //
+ // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
+ // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+ //
+ // The mov pseudo instruction could be expanded to multiple mov instructions
+ // later. Let's try to split the constant operand of the mov instruction into
+ // two bitmask immediates. It makes only two AND instructions instead of
+ // multiple mov + and instructions.
+
+ unsigned RegSize = sizeof(T) * 8;
+ assert((RegSize == 32 || RegSize == 64) &&
+ "Invalid RegSize for AND bitmask peephole optimization");
+
+ // Check whether the AND's MBB is in a loop and the AND is loop invariant.
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineLoop *L = MLI->getLoopFor(MBB);
+ if (L && !L->isLoopInvariant(MI))
+ return false;
+
+ // Check whether the AND's operand is a MOV with an immediate.
+ MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+ MachineInstr *SubregToRegMI = nullptr;
+ // If it is SUBREG_TO_REG, check its operand.
+ if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
+ SubregToRegMI = MovMI;
+ MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
+ }
+
+ if (MovMI->getOpcode() != AArch64::MOVi32imm &&
+ MovMI->getOpcode() != AArch64::MOVi64imm)
+ return false;
+
+ // If the MOV has multiple uses, do not split the immediate because splitting
+ // it would create more instructions.
+ if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
+ return false;
+
+ if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
+ return false;
+
+ // Split the bitmask immediate into two.
+ T UImm = static_cast<T>(MovMI->getOperand(1).getImm());
+ // For the 32-bit form of the instruction, the upper 32 bits of the
+ // destination register are set to zero. If there is a SUBREG_TO_REG, set the
+ // upper 32 bits of UImm to zero.
+ if (SubregToRegMI)
+ UImm &= 0xFFFFFFFF;
+ T Imm1Enc;
+ T Imm2Enc;
+ if (!splitBitmaskImm(UImm, RegSize, Imm1Enc, Imm2Enc))
+ return false;
+
+ // Create new AND MIs.
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetRegisterClass *ANDImmRC =
+ (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register NewTmpReg = MRI->createVirtualRegister(ANDImmRC);
+ unsigned Opcode = (RegSize == 32) ? AArch64::ANDWri : AArch64::ANDXri;
+
+ MRI->constrainRegClass(NewTmpReg, MRI->getRegClass(SrcReg));
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+ .addReg(SrcReg)
+ .addImm(Imm1Enc);
+
+ MRI->constrainRegClass(DstReg, ANDImmRC);
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
+ .addReg(NewTmpReg)
+ .addImm(Imm2Enc);
+
+ ToBeRemoved.insert(&MI);
+ if (SubregToRegMI)
+ ToBeRemoved.insert(SubregToRegMI);
+ ToBeRemoved.insert(MovMI);
+
+ return true;
+}
+
+bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ MLI = &getAnalysis<MachineLoopInfo>();
+ MRI = &MF.getRegInfo();
+
+ if (!MRI->isSSA())
+ return false;
+
+ bool Changed = false;
+ SmallSetVector<MachineInstr *, 8> ToBeRemoved;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AArch64::ANDWrr:
+ Changed = visitAND<uint32_t>(MI, ToBeRemoved);
+ break;
+ case AArch64::ANDXrr:
+ Changed = visitAND<uint64_t>(MI, ToBeRemoved);
+ break;
+ }
+ }
+ }
+
+ for (MachineInstr *MI : ToBeRemoved)
+ MI->eraseFromParent();
+
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
+ return new AArch64MIPeepholeOpt();
+}
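For readers who want to experiment with the split outside of LLVM, here is a
minimal standalone C++ sketch of the same mask computation. It is an
illustration only: it assumes a GCC/Clang toolchain for the bit-count builtins
and omits the isLogicalImmediate and expandMOVImm profitability checks that
gate the real pass.

#include <cstdint>
#include <cstdio>

// Split Imm into two masks whose bitwise AND reproduces Imm: Imm1 is
// filled with ones from the lowest set bit through the highest set bit;
// Imm2 keeps Imm's bits and sets every bit outside that range.
static bool splitImm64(uint64_t Imm, uint64_t &Imm1, uint64_t &Imm2) {
  if (Imm == 0)
    return false;
  unsigned Lowest = __builtin_ctzll(Imm);       // lowest set bit position
  unsigned Highest = 63 - __builtin_clzll(Imm); // highest set bit position
  Imm1 = (uint64_t(2) << Highest) - (uint64_t(1) << Lowest);
  Imm2 = Imm | ~Imm1;
  return (Imm1 & Imm2) == Imm; // holds by construction
}

int main() {
  uint64_t Imm1, Imm2;
  if (splitImm64(2098176, Imm1, Imm2)) // 0x200400, as in test1/test4
    std::printf("0x%llx = 0x%llx & 0x%llx\n", 2098176ULL,
                (unsigned long long)Imm1, (unsigned long long)Imm2);
  return 0;
}

Running it prints 0x200400 = 0x3ffc00 & 0xffffffffffe007ff, matching the
immediates checked in test4 of the new test file.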
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 6127f890118f9..5d26b6d41b4c5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -195,6 +195,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeAArch64DeadRegisterDefinitionsPass(*PR);
initializeAArch64ExpandPseudoPass(*PR);
initializeAArch64LoadStoreOptPass(*PR);
+ initializeAArch64MIPeepholeOptPass(*PR);
initializeAArch64SIMDInstrOptPass(*PR);
initializeAArch64O0PreLegalizerCombinerPass(*PR);
initializeAArch64PreLegalizerCombinerPass(*PR);
@@ -480,6 +481,7 @@ class AArch64PassConfig : public TargetPassConfig {
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
+ void addMachineSSAOptimization() override;
bool addILPOpts() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
@@ -656,6 +658,14 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
return false;
}
+void AArch64PassConfig::addMachineSSAOptimization() {
+ // Run the default MachineSSAOptimization first.
+ TargetPassConfig::addMachineSSAOptimization();
+
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createAArch64MIPeepholeOptPass());
+}
+
bool AArch64PassConfig::addILPOpts() {
if (EnableCondOpt)
addPass(createAArch64ConditionOptimizerPass());
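With the pass registered and inserted after the generic machine SSA
optimizations, its pipeline position can be verified the same way the updated
O3-pipeline.ll test does, and the pass can be exercised in isolation on MIR.
Illustrative invocations (input file names are placeholders):

llc -mtriple=aarch64-none-linux-gnu -O3 -debug-pass=Structure input.ll -o /dev/null
llc -mtriple=aarch64-none-linux-gnu -run-pass=aarch64-mi-peephole-opt input.mir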
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index a77a66bacc4c2..aeedeb4eebac8 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -66,6 +66,7 @@ add_llvm_target(AArch64CodeGen
AArch64LowerHomogeneousPrologEpilog.cpp
AArch64MachineFunctionInfo.cpp
AArch64MacroFusion.cpp
+ AArch64MIPeepholeOpt.cpp
AArch64MCInstLower.cpp
AArch64PromoteConstant.cpp
AArch64PBQPRegAlloc.cpp
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index c3e74757675b6..8765260935910 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+#include "AArch64ExpandImm.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/bit.h"
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 95816bd9d3262..94c61d66a20d4 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -40,7 +40,7 @@
; CHECK-NEXT: Induction Variable Users
; CHECK-NEXT: Loop Strength Reduction
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT: Function Alias Analysis Results
+; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Merge contiguous icmps into a memcmp
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Lazy Branch Probability Analysis
@@ -132,6 +132,7 @@
; CHECK-NEXT: Machine code sinking
; CHECK-NEXT: Peephole Optimizations
; CHECK-NEXT: Remove dead machine instructions
+; CHECK-NEXT: AArch64 MI Peephole Optimization pass
; CHECK-NEXT: AArch64 Dead register definitions
; CHECK-NEXT: Detect Dead Lanes
; CHECK-NEXT: Process Implicit Definitions
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
new file mode 100644
index 0000000000000..24b528fe7df01
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -0,0 +1,266 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+
+define i8 @test1(i32 %a) {
+; CHECK-LABEL: test1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0x3ffc00
+; CHECK-NEXT: and w8, w8, #0xffe007ff
+; CHECK-NEXT: cmp w8, #1024
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %a, 2098176
+ %cmp = icmp eq i32 %and, 1024
+ %conv = zext i1 %cmp to i8
+ ret i8 %conv
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i8 @test2(i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135
+; CHECK-NEXT: and w8, w0, w8
+; CHECK-NEXT: cmp w8, #1024
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %a, 135
+ %cmp = icmp eq i32 %and, 1024
+ %conv = zext i1 %cmp to i8
+ ret i8 %conv
+}
+
+; This constant should not be split because the split immediate is not a
+; valid bitmask immediate.
+define i8 @test3(i32 %a) {
+; CHECK-LABEL: test3:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: and w8, w0, w8
+; CHECK-NEXT: cmp w8, #1024
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %a, 2163712
+ %cmp = icmp eq i32 %and, 1024
+ %conv = zext i1 %cmp to i8
+ ret i8 %conv
+}
+
+define i8 @test4(i64 %a) {
+; CHECK-LABEL: test4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and x8, x0, #0x3ffc00
+; CHECK-NEXT: and x8, x8, #0xffffffffffe007ff
+; CHECK-NEXT: cmp x8, #1024
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+entry:
+ %and = and i64 %a, 2098176
+ %cmp = icmp eq i64 %and, 1024
+ %conv = zext i1 %cmp to i8
+ ret i8 %conv
+}
+
+define i8 @test5(i64 %a) {
+; CHECK-LABEL: test5:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and x8, x0, #0x3ffffc000
+; CHECK-NEXT: and x8, x8, #0xfffffffe00007fff
+; CHECK-NEXT: cmp x8, #1024
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+entry:
+ %and = and i64 %a, 8589950976
+ %cmp = icmp eq i64 %and, 1024
+ %conv = zext i1 %cmp to i8
+ ret i8 %conv
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i8 @test6(i64 %a) {
+; CHECK-LABEL: test6:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135
+; CHECK-NEXT: and x8, x0, x8
+; CHECK-NEXT: cmp x8, #1024
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+entry:
+ %and = and i64 %a, 135
+ %cmp = icmp eq i64 %and, 1024
+ %conv = zext i1 %cmp to i8
+ ret i8 %conv
+}
+
+; This constant should not be split because the split immediate is not a
+; valid bitmask immediate.
+define i8 @test7(i64 %a) {
+; CHECK-LABEL: test7:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: and x8, x0, x8
+; CHECK-NEXT: cmp x8, #1024
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+entry:
+ %and = and i64 %a, 2163712
+ %cmp = icmp eq i64 %and, 1024
+ %conv = zext i1 %cmp to i8
+ ret i8 %conv
+}
+
+; The split bitmask immediates should be hoisted outside the loop because
+; they are loop invariant.
+define void @test8(i64 %a, i64* noalias %src, i64* noalias %dst, i64 %n) {
+; CHECK-LABEL: test8:
+; CHECK: // %bb.0: // %loop.ph
+; CHECK-NEXT: and x9, x0, #0x3ffc00
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: and x9, x9, #0xffffffffffe007ff
+; CHECK-NEXT: b .LBB7_2
+; CHECK-NEXT: .LBB7_1: // %for.inc
+; CHECK-NEXT: // in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, x3
+; CHECK-NEXT: b.gt .LBB7_4
+; CHECK-NEXT: .LBB7_2: // %loop
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.hs .LBB7_1
+; CHECK-NEXT: // %bb.3: // %if.then
+; CHECK-NEXT: // in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT: lsl x10, x8, #3
+; CHECK-NEXT: ldr x11, [x1, x10]
+; CHECK-NEXT: str x11, [x2, x10]
+; CHECK-NEXT: b .LBB7_1
+; CHECK-NEXT: .LBB7_4: // %exit
+; CHECK-NEXT: ret
+loop.ph:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %inc, %for.inc ], [ 0, %loop.ph ]
+ %and = and i64 %a, 2098176
+ %cmp = icmp ult i64 %iv, %and
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %src.arrayidx = getelementptr inbounds i64, i64* %src, i64 %iv
+ %val = load i64, i64* %src.arrayidx
+ %dst.arrayidx = getelementptr inbounds i64, i64* %dst, i64 %iv
+ store i64 %val, i64* %dst.arrayidx
+ br label %for.inc
+
+if.else:
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i64 %iv, 1
+ %cond = icmp sgt i64 %inc, %n
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; This constant should not be split because the `and` is not loop invariant.
+define i32 @test9(i32* nocapture %x, i32* nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: test9:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w2, #1
+; CHECK-NEXT: b.lt .LBB8_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w9, #1024
+; CHECK-NEXT: mov w8, w2
+; CHECK-NEXT: movk w9, #32, lsl #16
+; CHECK-NEXT: .LBB8_2: // %for.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr w10, [x1], #4
+; CHECK-NEXT: subs x8, x8, #1
+; CHECK-NEXT: and w10, w10, w9
+; CHECK-NEXT: str w10, [x0], #4
+; CHECK-NEXT: b.ne .LBB8_2
+; CHECK-NEXT: .LBB8_3: // %for.cond.cleanup
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+entry:
+ %cmp8 = icmp sgt i32 %n, 0
+ br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret i32 0
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %and = and i32 %0, 2098176
+ %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+ store i32 %and, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; After instruction selection ends, we can see that the `and` and `or` share
+; the constant, as below.
+;
+; %4:gpr32 = MOVi32imm 2098176
+; %5:gpr32 = ANDWrr killed %3:gpr32, %4:gpr32
+; STRWui killed %5:gpr32, %0:gpr64common, 0 :: (store (s32) into %ir.x, !tbaa !8)
+; %6:gpr32 = LDRWui %1:gpr64common, 0 :: (load (s32) from %ir.y, !tbaa !8)
+; %7:gpr32 = ORRWrr killed %6:gpr32, %4:gpr32
+;
+; In this case, the constant should not be split because splitting it would
+; create more instructions.
+define void @test10(i32* nocapture %x, i32* nocapture readonly %y, i32* nocapture %z) {
+; CHECK-LABEL: test10:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x1]
+; CHECK-NEXT: mov w9, #1024
+; CHECK-NEXT: movk w9, #32, lsl #16
+; CHECK-NEXT: and w8, w8, w9
+; CHECK-NEXT: str w8, [x0]
+; CHECK-NEXT: ldr w8, [x1]
+; CHECK-NEXT: orr w8, w8, w9
+; CHECK-NEXT: str w8, [x2]
+; CHECK-NEXT: ret
+entry:
+ %0 = load i32, i32* %y, align 4
+ %and = and i32 %0, 2098176
+ store i32 %and, i32* %x, align 4
+ %1 = load i32, i32* %y, align 4
+ %or = or i32 %1, 2098176
+ store i32 %or, i32* %z, align 4
+ ret void
+}
+
+; This test generates the below MIs:
+;
+; MOVi32imm -1610612736
+; SUBREG_TO_REG
+;
+; The constant should be zero-extended to 64 bits, and it should not be split.
+define i8 @test11(i64 %a) {
+; CHECK-LABEL: test11:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #-1610612736
+; CHECK-NEXT: and x8, x0, x8
+; CHECK-NEXT: cmp x8, #1024
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+entry:
+ %and = and i64 %a, 2684354560
+ %cmp = icmp eq i64 %and, 1024
+ %conv = zext i1 %cmp to i8
+ ret i8 %conv
+}
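As the NOTE at the top of the file says, the CHECK lines are autogenerated, so
the test can be refreshed after future codegen changes with the usual script:

llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll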
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
index 2e20ef67b2a2d..3e30f45cfabb3 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -245,10 +245,9 @@ define i32 @in_multiuse_B_constmask(i32 %x, i32 %y, i32 %z) nounwind {
define i32 @n0_badconstmask(i32 %x, i32 %y) {
; CHECK-LABEL: n0_badconstmask:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #256
-; CHECK-NEXT: movk w9, #65280, lsl #16
+; CHECK-NEXT: and w9, w1, #0xffffff00
; CHECK-NEXT: and w8, w0, #0xffff00
-; CHECK-NEXT: and w9, w1, w9
+; CHECK-NEXT: and w9, w9, #0xff0001ff
; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%mx = and i32 %x, 16776960