[llvm] f65651c - [AArch64] Fixes ADD/SUB opt bug and abstracts shared behavior in MIPeepholeOpt for ADD, SUB, and AND.

Ben Shi via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 25 20:22:41 PST 2022


Author: Micah Weston
Date: 2022-01-26T04:22:27Z
New Revision: f65651cc8aa87ba3a505ba3edebf81a9797685a2

URL: https://github.com/llvm/llvm-project/commit/f65651cc8aa87ba3a505ba3edebf81a9797685a2
DIFF: https://github.com/llvm/llvm-project/commit/f65651cc8aa87ba3a505ba3edebf81a9797685a2.diff

LOG: [AArch64] Fixes ADD/SUB opt bug and abstracts shared behavior in MIPeepholeOpt for ADD, SUB, and AND.

This fixes a bug where (SUBREG_TO_REG 0 (MOVi32imm <negative-number>) sub_32)
would generate invalid code since the top 32-bits were not zeroed when inspecting the
immediate value. A new test was added for this case.

Change to abstract shared behavior in MIPeepholeOpt. Both
visitAND and visitADDSUB attempt to split an RR instruction with an immediate
operand into two RI instructions with the immediate split.

The differing behavior lies in how the immediate is split into two pieces and
how the new instructions are built. The rest of the behavior (adding new VRegs,
checking for the MOVImm, constraining reg classes, removing old instructions)
are shared between the operations.

The new helper function splitTwoPartImm implements the shared behavior and
delegates differing behavior to two function objects passed by the caller.
One function object splits the immediate into two values and returns the
opcode to use if it is a valid split. The other function object builds
the new instructions.

I felt this abstraction would help since I believe it will reduce the
code repetition when adding new instructions of the pattern, such as
SUBS for this conditional optimization.

Tested it locally by running check all with compiler-rt, mlir, clang-tools-extra,
flang, llvm, and clang enabled.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D118000

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
    llvm/test/CodeGen/AArch64/addsub.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 0ddfc717e47f3..1fc5617b49f66 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -37,6 +37,7 @@
 #include "AArch64ExpandImm.h"
 #include "AArch64InstrInfo.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -55,17 +56,44 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
   }
 
   const AArch64InstrInfo *TII;
+  const AArch64RegisterInfo *TRI;
   MachineLoopInfo *MLI;
   MachineRegisterInfo *MRI;
 
+  template <typename T>
+  using SplitAndOpcFunc =
+      std::function<Optional<unsigned>(T, unsigned, T &, T &)>;
+  using BuildMIFunc =
+      std::function<void(MachineInstr &, unsigned, unsigned, unsigned, Register,
+                         Register, Register)>;
+
+  /// For instructions where an immediate operand could be split into two
+  /// separate immediate instructions, use the splitTwoPartImm to handle the
+  /// optimization.
+  ///
+  /// To implement, the following function types must be passed to
+  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
+  /// splitting the immediate is valid and returns the associated new opcode. A
+  /// BuildMIFunc must be implemented to build the two immediate instructions.
+  ///
+  /// Example Pattern (where IMM would require 2+ MOV instructions):
+  ///     %dst = <Instr>rr %src IMM [...]
+  /// becomes:
+  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
+  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
+  template <typename T>
+  bool splitTwoPartImm(MachineInstr &MI,
+                       SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
+                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);
+
   bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                         MachineInstr *&SubregToRegMI);
 
   template <typename T>
-  bool visitADDSUB(MachineInstr &MI,
-                   SmallSetVector<MachineInstr *, 8> &ToBeRemoved, bool IsAdd);
+  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
+                   SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
   template <typename T>
-  bool visitAND(MachineInstr &MI,
+  bool visitAND(unsigned Opc, MachineInstr &MI,
                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
   bool visitORR(MachineInstr &MI,
                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
@@ -129,7 +157,8 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
 
 template <typename T>
 bool AArch64MIPeepholeOpt::visitAND(
-    MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+    unsigned Opc, MachineInstr &MI,
+    SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
   // Try below transformation.
   //
   // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
@@ -140,59 +169,25 @@ bool AArch64MIPeepholeOpt::visitAND(
   // bitmask immediates. It makes only two AND instructions intead of multiple
   // mov + and instructions.
 
-  unsigned RegSize = sizeof(T) * 8;
-  assert((RegSize == 32 || RegSize == 64) &&
-         "Invalid RegSize for AND bitmask peephole optimization");
-
-  // Perform several essential checks against current MI.
-  MachineInstr *MovMI = nullptr, *SubregToRegMI = nullptr;
-  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
-    return false;
-
-  // Split the bitmask immediate into two.
-  T UImm = static_cast<T>(MovMI->getOperand(1).getImm());
-  // For the 32 bit form of instruction, the upper 32 bits of the destination
-  // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
-  // of UImm to zero.
-  if (SubregToRegMI)
-    UImm &= 0xFFFFFFFF;
-  T Imm1Enc;
-  T Imm2Enc;
-  if (!splitBitmaskImm(UImm, RegSize, Imm1Enc, Imm2Enc))
-    return false;
-
-  // Create new AND MIs.
-  DebugLoc DL = MI.getDebugLoc();
-  MachineBasicBlock *MBB = MI.getParent();
-  const TargetRegisterClass *ANDImmRC =
-      (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
-  Register DstReg = MI.getOperand(0).getReg();
-  Register SrcReg = MI.getOperand(1).getReg();
-  Register NewTmpReg = MRI->createVirtualRegister(ANDImmRC);
-  Register NewDstReg = MRI->createVirtualRegister(ANDImmRC);
-  unsigned Opcode = (RegSize == 32) ? AArch64::ANDWri : AArch64::ANDXri;
-
-  MRI->constrainRegClass(NewTmpReg, MRI->getRegClass(SrcReg));
-  BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
-      .addReg(SrcReg)
-      .addImm(Imm1Enc);
-
-  MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
-  BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
-      .addReg(NewTmpReg)
-      .addImm(Imm2Enc);
-
-  MRI->replaceRegWith(DstReg, NewDstReg);
-  // replaceRegWith changes MI's definition register. Keep it for SSA form until
-  // deleting MI.
-  MI.getOperand(0).setReg(DstReg);
-
-  ToBeRemoved.insert(&MI);
-  if (SubregToRegMI)
-    ToBeRemoved.insert(SubregToRegMI);
-  ToBeRemoved.insert(MovMI);
-
-  return true;
+  return splitTwoPartImm<T>(
+      MI, ToBeRemoved,
+      [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> {
+        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
+          return Opc;
+        return None;
+      },
+      [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
+                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
+                   Register NewDstReg) {
+        DebugLoc DL = MI.getDebugLoc();
+        MachineBasicBlock *MBB = MI.getParent();
+        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+            .addReg(SrcReg)
+            .addImm(Imm0);
+        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+            .addReg(NewTmpReg)
+            .addImm(Imm1);
+      });
 }
 
 bool AArch64MIPeepholeOpt::visitORR(
@@ -260,8 +255,8 @@ static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
 
 template <typename T>
 bool AArch64MIPeepholeOpt::visitADDSUB(
-    MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
-    bool IsAdd) {
+    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
+    SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
   // Try below transformation.
   //
   // MOVi32imm + ADDWrr ==> ADDWri + ADDWri
@@ -275,66 +270,30 @@ bool AArch64MIPeepholeOpt::visitADDSUB(
   // legal add/sub immediates. It makes only two ADD/SUB instructions intead of
   // multiple `mov` + `and/sub` instructions.
 
-  unsigned RegSize = sizeof(T) * 8;
-  assert((RegSize == 32 || RegSize == 64) &&
-         "Invalid RegSize for legal add/sub immediate peephole optimization");
-
-  // Perform several essential checks against current MI.
-  MachineInstr *MovMI, *SubregToRegMI;
-  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
-    return false;
-
-  // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
-  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
-  unsigned Opcode;
-  if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) {
-    if (IsAdd)
-      Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri;
-    else
-      Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri;
-  } else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) {
-    if (IsAdd)
-      Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri;
-    else
-      Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri;
-  } else {
-    return false;
-  }
-
-  // Create new ADD/SUB MIs.
-  DebugLoc DL = MI.getDebugLoc();
-  MachineBasicBlock *MBB = MI.getParent();
-  const TargetRegisterClass *RC =
-      (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
-  Register DstReg = MI.getOperand(0).getReg();
-  Register SrcReg = MI.getOperand(1).getReg();
-  Register NewTmpReg = MRI->createVirtualRegister(RC);
-  Register NewDstReg = MRI->createVirtualRegister(RC);
-
-  MRI->constrainRegClass(SrcReg, RC);
-  BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
-      .addReg(SrcReg)
-      .addImm(Imm0)
-      .addImm(12);
-
-  MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
-  BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
-      .addReg(NewTmpReg)
-      .addImm(Imm1)
-      .addImm(0);
-
-  MRI->replaceRegWith(DstReg, NewDstReg);
-  // replaceRegWith changes MI's definition register. Keep it for SSA form until
-  // deleting MI.
-  MI.getOperand(0).setReg(DstReg);
-
-  // Record the MIs need to be removed.
-  ToBeRemoved.insert(&MI);
-  if (SubregToRegMI)
-    ToBeRemoved.insert(SubregToRegMI);
-  ToBeRemoved.insert(MovMI);
-
-  return true;
+  return splitTwoPartImm<T>(
+      MI, ToBeRemoved,
+      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
+                       T &Imm1) -> Optional<unsigned> {
+        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
+          return PosOpc;
+        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
+          return NegOpc;
+        return None;
+      },
+      [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
+                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
+                   Register NewDstReg) {
+        DebugLoc DL = MI.getDebugLoc();
+        MachineBasicBlock *MBB = MI.getParent();
+        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+            .addReg(SrcReg)
+            .addImm(Imm0)
+            .addImm(12);
+        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+            .addReg(NewTmpReg)
+            .addImm(Imm1)
+            .addImm(0);
+      });
 }
 
 // Checks if the corresponding MOV immediate instruction is applicable for
@@ -377,15 +336,75 @@ bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
   return true;
 }
 
+template <typename T>
+bool AArch64MIPeepholeOpt::splitTwoPartImm(
+    MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
+    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
+  unsigned RegSize = sizeof(T) * 8;
+  assert((RegSize == 32 || RegSize == 64) &&
+         "Invalid RegSize for legal immediate peephole optimization");
+
+  // Perform several essential checks against current MI.
+  MachineInstr *MovMI, *SubregToRegMI;
+  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
+    return false;
+
+  // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
+  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
+  // For the 32 bit form of instruction, the upper 32 bits of the destination
+  // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
+  // of Imm to zero. This is essential if the Immediate value was a negative
+  // number since it was sign extended when we assign to the 64-bit Imm.
+  if (SubregToRegMI)
+    Imm &= 0xFFFFFFFF;
+  unsigned Opcode;
+  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
+    Opcode = R.getValue();
+  else
+    return false;
+
+  // Create new ADD/SUB MIs.
+  MachineFunction *MF = MI.getMF();
+  const TargetRegisterClass *RC =
+      TII->getRegClass(TII->get(Opcode), 0, TRI, *MF);
+  const TargetRegisterClass *ORC =
+      TII->getRegClass(TII->get(Opcode), 1, TRI, *MF);
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register NewTmpReg = MRI->createVirtualRegister(RC);
+  Register NewDstReg = MRI->createVirtualRegister(RC);
+
+  MRI->constrainRegClass(SrcReg, RC);
+  MRI->constrainRegClass(NewTmpReg, ORC);
+  MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
+
+  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);
+
+  MRI->replaceRegWith(DstReg, NewDstReg);
+  // replaceRegWith changes MI's definition register. Keep it for SSA form until
+  // deleting MI.
+  MI.getOperand(0).setReg(DstReg);
+
+  // Record the MIs need to be removed.
+  ToBeRemoved.insert(&MI);
+  if (SubregToRegMI)
+    ToBeRemoved.insert(SubregToRegMI);
+  ToBeRemoved.insert(MovMI);
+
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
   TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TRI = static_cast<const AArch64RegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
   MLI = &getAnalysis<MachineLoopInfo>();
   MRI = &MF.getRegInfo();
 
-  assert (MRI->isSSA() && "Expected to be run on SSA form!");
+  assert(MRI->isSSA() && "Expected to be run on SSA form!");
 
   bool Changed = false;
   SmallSetVector<MachineInstr *, 8> ToBeRemoved;
@@ -396,25 +415,29 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
       default:
         break;
       case AArch64::ANDWrr:
-        Changed = visitAND<uint32_t>(MI, ToBeRemoved);
+        Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved);
         break;
       case AArch64::ANDXrr:
-        Changed = visitAND<uint64_t>(MI, ToBeRemoved);
+        Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved);
         break;
       case AArch64::ORRWrs:
         Changed = visitORR(MI, ToBeRemoved);
         break;
       case AArch64::ADDWrr:
-        Changed = visitADDSUB<uint32_t>(MI, ToBeRemoved, true);
+        Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI,
+                                        ToBeRemoved);
         break;
       case AArch64::SUBWrr:
-        Changed = visitADDSUB<uint32_t>(MI, ToBeRemoved, false);
+        Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI,
+                                        ToBeRemoved);
         break;
       case AArch64::ADDXrr:
-        Changed = visitADDSUB<uint64_t>(MI, ToBeRemoved, true);
+        Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI,
+                                        ToBeRemoved);
         break;
       case AArch64::SUBXrr:
-        Changed = visitADDSUB<uint64_t>(MI, ToBeRemoved, false);
+        Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI,
+                                        ToBeRemoved);
         break;
       }
     }

diff  --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
index 37c9e4c5c6fe1..b95c15ac6d073 100644
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -389,4 +389,21 @@ define i1 @uadd_add(i8 %a, i8 %b, i8* %p) {
   ret i1 %e1
 }
 
+; This is a unique edge case that will generate the following MIR
+;   MOVi32imm -1000000
+;   SUBREG_TO_REG 0, killed %1, %subreg.sub_32
+; When using a 64-bit unsigned for the "-1000000" immediate, the code
+; must make sure to zero out the top 32 bits since SUBREG_TO_REG is
+; zero extending the value
+define i64 @addl_0x80000000(i64 %a) {
+; CHECK-LABEL: addl_0x80000000:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #48576
+; CHECK-NEXT:    movk w8, #65520, lsl #16
+; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    ret
+  %b = add i64 %a, 4293967296
+  ret i64 %b
+}
+
 ; TODO: adds/subs


        


More information about the llvm-commits mailing list