[llvm] [M68k] Introduce more MOVI cases (PR #90817)

Wed May 1 20:21:53 PDT 2024

https://github.com/n8pjl created https://github.com/llvm/llvm-project/pull/90817

Add two more special cases for loading registers with immediates.

The first allows values in the range of [-255, 255] to be loaded with MOVEQ,
even if the register is more than 8 bits and the sign extension is
unwanted. This is done by loading the bitwise complement of the desired
value, then performing a NOT instruction on the loaded register.

This special case is only used when a simple MOVEQ cannot be used, and
is only used for 32-bit data registers. Address registers cannot support
MOVEQ, and the two-instruction sequence is no faster or smaller than a plain
MOVE instruction when loading 16-bit immediates on the 68000, and likely
slower on more sophisticated microarchitectures. However, the
instruction sequence is both smaller and faster than the corresponding
MOVE instruction for 32-bit register widths.

The second special case is for zeroing address registers. This simply
expands to subtracting the register from itself, consuming one instruction
word rather than 2-3, with a small improvement in speed as well.


>From 0dbbfdb0fcc05dee7fa8eb47c910a5eb5fdd2253 Mon Sep 17 00:00:00 2001
From: Peter Lafreniere <peter at n8pjl.ca>
Date: Wed, 1 May 2024 23:04:37 -0400
Subject: [PATCH] [M68k] Introduce more MOVI cases

Add two more special cases for loading registers with immediates.

The first allows values in the range of [-255, 255] to be loaded with MOVEQ,
even if the register is more than 8 bits and the sign extension is
unwanted. This is done by loading the bitwise complement of the desired
value, then performing a NOT instruction on the loaded register.

This special case is only used when a simple MOVEQ cannot be used, and
is only used for 32-bit data registers. Address registers cannot support
MOVEQ, and the two-instruction sequence is no faster or smaller than a plain
MOVE instruction when loading 16-bit immediates on the 68000, and likely
slower on more sophisticated microarchitectures. However, the
instruction sequence is both smaller and faster than the corresponding
MOVE instruction for 32-bit register widths.

The second special case is for zeroing address registers. This simply
expands to subtracting the register from itself, consuming one instruction
word rather than 2-3, with a small improvement in speed as well.
---
 llvm/lib/Target/M68k/M68kInstrInfo.cpp        |  45 +++++-
 llvm/test/CodeGen/M68k/Arith/add.ll           |   3 +-
 .../CodeGen/M68k/{ => Data}/link-unlnk.ll     |   0
 .../CodeGen/M68k/{ => Data}/load-extend.ll    |   0
 llvm/test/CodeGen/M68k/Data/load-imm.ll       | 140 ++++++++++++++++++
 5 files changed, 180 insertions(+), 8 deletions(-)
 rename llvm/test/CodeGen/M68k/{ => Data}/link-unlnk.ll (100%)
 rename llvm/test/CodeGen/M68k/{ => Data}/load-extend.ll (100%)
 create mode 100644 llvm/test/CodeGen/M68k/Data/load-imm.ll

diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
index 338db45782c961..ab92f3fff0e9aa 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
@@ -346,8 +346,8 @@ void M68kInstrInfo::AddZExt(MachineBasicBlock &MBB,
   BuildMI(MBB, I, DL, get(And), Reg).addReg(Reg).addImm(Mask);
 }
 
-// Convert MOVI to MOVQ if the target is a data register and the immediate
-// fits in a sign-extended i8, otherwise emit a plain MOV.
+// Convert MOVI to the appropriate instruction (sequence) for setting
+// the register to an immediate value.
 bool M68kInstrInfo::ExpandMOVI(MachineInstrBuilder &MIB, MVT MVTSize) const {
   Register Reg = MIB->getOperand(0).getReg();
   int64_t Imm = MIB->getOperand(1).getImm();
@@ -360,18 +360,49 @@ bool M68kInstrInfo::ExpandMOVI(MachineInstrBuilder &MIB, MVT MVTSize) const {
   if (AR16->contains(Reg) || AR32->contains(Reg))
     IsAddressReg = true;
 
+  // We need to assign to the full register to make IV happy
+  Register SReg =
+      MVTSize == MVT::i32
+          ? Reg
+          : Register(RI.getMatchingMegaReg(Reg, IsAddressReg ? AR32 : DR32));
+  assert(SReg && "No viable MEGA register available");
+
   LLVM_DEBUG(dbgs() << "Expand " << *MIB.getInstr() << " to ");
 
+  // Sign extension doesn't matter if we only use the bottom 8 bits
   if (MVTSize == MVT::i8 || (!IsAddressReg && Imm >= -128 && Imm <= 127)) {
     LLVM_DEBUG(dbgs() << "MOVEQ\n");
 
-    // We need to assign to the full register to make IV happy
-    Register SReg =
-        MVTSize == MVT::i32 ? Reg : Register(RI.getMatchingMegaReg(Reg, DR32));
-    assert(SReg && "No viable MEGA register available");
-
     MIB->setDesc(get(M68k::MOVQ));
     MIB->getOperand(0).setReg(SReg);
+
+    // Counter the effects of sign-extension with a bitwise not.
+    // This is only faster and smaller for 32 bit values.
+  } else if (DR32->contains(Reg) && std::abs(Imm) <= 255) {
+    LLVM_DEBUG(dbgs() << "MOVEQ and NOT\n");
+
+    MachineBasicBlock &MBB = *MIB->getParent();
+    DebugLoc DL = MIB->getDebugLoc();
+
+    BuildMI(MBB, MIB.getInstr(), DL, get(M68k::MOVQ), SReg).addImm(~Imm);
+    BuildMI(MBB, MIB.getInstr(), DL, get(M68k::NOT32d), SReg).addReg(SReg);
+
+    MIB->removeFromParent();
+
+    // Special case for setting address register to NULL (0)
+  } else if (IsAddressReg && Imm == 0) {
+    LLVM_DEBUG(dbgs() << "SUBA\n");
+
+    MachineBasicBlock &MBB = *MIB->getParent();
+    DebugLoc DL = MIB->getDebugLoc();
+
+    BuildMI(MBB, MIB.getInstr(), DL, get(M68k::SUB32ar), SReg)
+        .addReg(SReg, RegState::Undef)
+        .addReg(SReg, RegState::Undef);
+
+    MIB->removeFromParent();
+
+    // Fall back to a move with immediate
   } else {
     LLVM_DEBUG(dbgs() << "MOVE\n");
     MIB->setDesc(get(MVTSize == MVT::i16 ? M68k::MOV16ri : M68k::MOV32ri));
diff --git a/llvm/test/CodeGen/M68k/Arith/add.ll b/llvm/test/CodeGen/M68k/Arith/add.ll
index a9eb0bb815b088..dab3851afbe6c2 100644
--- a/llvm/test/CodeGen/M68k/Arith/add.ll
+++ b/llvm/test/CodeGen/M68k/Arith/add.ll
@@ -65,7 +65,8 @@ define fastcc void @test4(ptr inreg %a) nounwind {
 ; CHECK-NEXT:    movem.l %d2, (0,%sp) ; 8-byte Folded Spill
 ; CHECK-NEXT:    move.l (%a0), %d0
 ; CHECK-NEXT:    moveq #0, %d1
-; CHECK-NEXT:    move.l #128, %d2
+; CHECK-NEXT:    moveq #-129, %d2
+; CHECK-NEXT:    not.l %d2
 ; CHECK-NEXT:    add.l (4,%a0), %d2
 ; CHECK-NEXT:    addx.l %d0, %d1
 ; CHECK-NEXT:    move.l %d2, (4,%a0)
diff --git a/llvm/test/CodeGen/M68k/link-unlnk.ll b/llvm/test/CodeGen/M68k/Data/link-unlnk.ll
similarity index 100%
rename from llvm/test/CodeGen/M68k/link-unlnk.ll
rename to llvm/test/CodeGen/M68k/Data/link-unlnk.ll
diff --git a/llvm/test/CodeGen/M68k/load-extend.ll b/llvm/test/CodeGen/M68k/Data/load-extend.ll
similarity index 100%
rename from llvm/test/CodeGen/M68k/load-extend.ll
rename to llvm/test/CodeGen/M68k/Data/load-extend.ll
diff --git a/llvm/test/CodeGen/M68k/Data/load-imm.ll b/llvm/test/CodeGen/M68k/Data/load-imm.ll
new file mode 100644
index 00000000000000..d63aeba7d2bc93
--- /dev/null
+++ b/llvm/test/CodeGen/M68k/Data/load-imm.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=m68k-linux -verify-machineinstrs | FileCheck %s
+
+define i1 @return_true() {
+; CHECK-LABEL: return_true:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #1, %d0
+; CHECK-NEXT:    rts
+  ret i1 true
+}
+
+define i8 @return_0_i8() {
+; CHECK-LABEL: return_0_i8:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #0, %d0
+; CHECK-NEXT:    rts
+  ret i8 0
+}
+
+define i16 @return_0_i16() {
+; CHECK-LABEL: return_0_i16:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #0, %d0
+; CHECK-NEXT:    rts
+  ret i16 0
+}
+
+define i32 @return_0_i32() {
+; CHECK-LABEL: return_0_i32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #0, %d0
+; CHECK-NEXT:    rts
+  ret i32 0
+}
+
+define i64 @return_0_i64() {
+; CHECK-LABEL: return_0_i64:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #0, %d0
+; CHECK-NEXT:    move.l %d0, %d1
+; CHECK-NEXT:    rts
+  ret i64 0
+}
+
+define i16 @return_neg1_i16() {
+; CHECK-LABEL: return_neg1_i16:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #-1, %d0
+; CHECK-NEXT:    rts
+  ret i16 -1
+}
+
+define i32 @return_neg1_i32() {
+; CHECK-LABEL: return_neg1_i32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #-1, %d0
+; CHECK-NEXT:    rts
+  ret i32 -1
+}
+
+define i8 @return_160_i8() {
+; CHECK-LABEL: return_160_i8:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #-96, %d0
+; CHECK-NEXT:    rts
+  ret i8 160
+}
+
+define i16 @return_160_i16() {
+; CHECK-LABEL: return_160_i16:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    move.w #160, %d0
+; CHECK-NEXT:    rts
+  ret i16 160
+}
+
+define i32 @return_160_i32() {
+; CHECK-LABEL: return_160_i32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #-161, %d0
+; CHECK-NEXT:    not.l %d0
+; CHECK-NEXT:    rts
+  ret i32 160
+}
+
+define i16 @return_14281_i16() {
+; CHECK-LABEL: return_14281_i16:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    move.w #14281, %d0
+; CHECK-NEXT:    rts
+  ret i16 14281
+}
+
+define i32 @return_14281_i32() {
+; CHECK-LABEL: return_14281_i32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    move.l #14281, %d0
+; CHECK-NEXT:    rts
+  ret i32 14281
+}
+
+define i64 @return_14281_i64() {
+; CHECK-LABEL: return_14281_i64:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    moveq #0, %d0
+; CHECK-NEXT:    move.l #14281, %d1
+; CHECK-NEXT:    rts
+  ret i64 14281
+}
+
+define ptr @return_null() {
+; CHECK-LABEL: return_null:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    suba.l %a0, %a0
+; CHECK-NEXT:    rts
+  ret ptr null
+}
+
+define ptr @return_nonnull() {
+; CHECK-LABEL: return_nonnull:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    move.l #200, %a0
+; CHECK-NEXT:    rts
+  ret ptr inttoptr (i32 200 to ptr)
+}