[llvm] 840d10a - [AVR] Custom lower 32-bit shift instructions
Ayke van Laethem via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 8 11:05:54 PST 2023
Author: Ayke van Laethem
Date: 2023-01-08T20:05:31+01:00
New Revision: 840d10a1d2c939016f387a041b0fbb3a9d592b17
URL: https://github.com/llvm/llvm-project/commit/840d10a1d2c939016f387a041b0fbb3a9d592b17
DIFF: https://github.com/llvm/llvm-project/commit/840d10a1d2c939016f387a041b0fbb3a9d592b17.diff
LOG: [AVR] Custom lower 32-bit shift instructions
32-bit shift instructions were previously expanded using the default
SelectionDAG expander, which meant it used 16-bit constant shifts and
ORed them together. This works, but is far from optimal.
I've optimized 32-bit shifts on AVR using a custom inserter. This is
done using three new pseudo-instructions that take the upper and lower
bits of the value in two separate 16-bit registers and output two
16-bit registers.
This is the first commit in a series. When completed, shift instructions
will take around 31% fewer instructions on average for constant 32-bit
shifts, and are in all cases equal to or better than the old behavior. It
also tends to match or outperform avr-gcc: the only cases where avr-gcc
does better are when it uses a loop to shift, or when the LLVM register
allocator inserts some unnecessary movs. But it even outperforms avr-gcc
in some cases where avr-gcc does not use a loop.
As a side effect, non-constant 32-bit shifts also become more efficient.
For some real-world differences: the build of compiler-rt I use in
TinyGo becomes 2.7% smaller and the build of picolibc I use becomes 0.9%
smaller. I think picolibc is a better representation of real-world code,
but even a ~1% reduction in code size is really significant.
The current patch just lays the groundwork. The result is actually a
regression in code size. Later patches will use this as a basis to
optimize these shift instructions.
Differential Revision: https://reviews.llvm.org/D140569
Added:
llvm/test/CodeGen/AVR/shift32.ll
Modified:
llvm/lib/Target/AVR/AVRISelLowering.cpp
llvm/lib/Target/AVR/AVRISelLowering.h
llvm/lib/Target/AVR/AVRInstrInfo.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index eed25e820ce7..1770cdbdd5c9 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -88,6 +88,9 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i16, Custom);
setOperationAction(ISD::SHL, MVT::i16, Custom);
setOperationAction(ISD::SRL, MVT::i16, Custom);
+ setOperationAction(ISD::SRA, MVT::i32, Custom);
+ setOperationAction(ISD::SHL, MVT::i32, Custom);
+ setOperationAction(ISD::SRL, MVT::i32, Custom);
setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
@@ -247,10 +250,13 @@ const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE(CALL);
NODE(WRAPPER);
NODE(LSL);
+ NODE(LSLW);
NODE(LSR);
+ NODE(LSRW);
NODE(ROL);
NODE(ROR);
NODE(ASR);
+ NODE(ASRW);
NODE(LSLLOOP);
NODE(LSRLOOP);
NODE(ROLLOOP);
@@ -279,6 +285,57 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
assert(isPowerOf2_32(VT.getSizeInBits()) &&
"Expected power-of-2 shift amount");
+ if (VT.getSizeInBits() == 32) {
+ if (!isa<ConstantSDNode>(N->getOperand(1))) {
+ // 32-bit shifts are converted to a loop in IR.
+ // This should be unreachable.
+ report_fatal_error("Expected a constant shift amount!");
+ }
+ SDVTList ResTys = DAG.getVTList(MVT::i16, MVT::i16);
+ SDValue SrcLo =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
+ DAG.getConstant(0, dl, MVT::i16));
+ SDValue SrcHi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
+ DAG.getConstant(1, dl, MVT::i16));
+ uint64_t ShiftAmount =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ if (ShiftAmount == 16) {
+ // Special case these two operations because they appear to be used by the
+ // generic codegen parts to lower 32-bit numbers.
+ // TODO: perhaps we can lower shift amounts bigger than 16 to a 16-bit
+ // shift of a part of the 32-bit value?
+ switch (Op.getOpcode()) {
+ case ISD::SHL: {
+ SDValue Zero = DAG.getConstant(0, dl, MVT::i16);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Zero, SrcLo);
+ }
+ case ISD::SRL: {
+ SDValue Zero = DAG.getConstant(0, dl, MVT::i16);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, SrcHi, Zero);
+ }
+ }
+ }
+ SDValue Cnt = DAG.getTargetConstant(ShiftAmount, dl, MVT::i8);
+ unsigned Opc;
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Invalid 32-bit shift opcode!");
+ case ISD::SHL:
+ Opc = AVRISD::LSLW;
+ break;
+ case ISD::SRL:
+ Opc = AVRISD::LSRW;
+ break;
+ case ISD::SRA:
+ Opc = AVRISD::ASRW;
+ break;
+ }
+ SDValue Result = DAG.getNode(Opc, dl, ResTys, SrcLo, SrcHi, Cnt);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Result.getValue(0),
+ Result.getValue(1));
+ }
+
// Expand non-constant shifts to loops.
if (!isa<ConstantSDNode>(N->getOperand(1))) {
switch (Op.getOpcode()) {
@@ -1789,6 +1846,114 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
return RemBB;
}
+// Do a multibyte AVR shift. Insert shift instructions and put the output
+// registers in the Regs array.
+// Because AVR does not have a normal shift instruction (only a single bit shift
+// instruction), we have to emulate this behavior with other instructions.
+static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
+ MutableArrayRef<std::pair<Register, int>> Regs,
+ ISD::NodeType Opc, int64_t ShiftAmt) {
+ const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const DebugLoc &dl = MI.getDebugLoc();
+
+ const bool ShiftLeft = Opc == ISD::SHL;
+ const bool ArithmeticShift = Opc == ISD::SRA;
+
+ // Shift by one. This is the fallback that always works, and the shift
+ // operation that is used for 1, 2, and 3 bit shifts.
+ while (ShiftLeft && ShiftAmt) {
+ // Shift one to the left.
+ for (ssize_t I = Regs.size() - 1; I >= 0; I--) {
+ Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ Register In = Regs[I].first;
+ Register InSubreg = Regs[I].second;
+ if (I == (ssize_t)Regs.size() - 1) { // first iteration
+ BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Out)
+ .addReg(In, 0, InSubreg)
+ .addReg(In, 0, InSubreg);
+ } else {
+ BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), Out)
+ .addReg(In, 0, InSubreg)
+ .addReg(In, 0, InSubreg);
+ }
+ Regs[I] = std::pair(Out, 0);
+ }
+ ShiftAmt--;
+ }
+ while (!ShiftLeft && ShiftAmt) {
+ // Shift one to the right.
+ for (size_t I = 0; I < Regs.size(); I++) {
+ Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ Register In = Regs[I].first;
+ Register InSubreg = Regs[I].second;
+ if (I == 0) {
+ unsigned Opc = ArithmeticShift ? AVR::ASRRd : AVR::LSRRd;
+ BuildMI(*BB, MI, dl, TII.get(Opc), Out).addReg(In, 0, InSubreg);
+ } else {
+ BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), Out).addReg(In, 0, InSubreg);
+ }
+ Regs[I] = std::pair(Out, 0);
+ }
+ ShiftAmt--;
+ }
+
+ if (ShiftAmt != 0) {
+ llvm_unreachable("don't know how to shift!"); // sanity check
+ }
+}
+
+// Do a wide (32-bit) shift.
+MachineBasicBlock *
+AVRTargetLowering::insertWideShift(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ const DebugLoc &dl = MI.getDebugLoc();
+
+ // How much to shift to the right (meaning: a negative number indicates a left
+ // shift).
+ int64_t ShiftAmt = MI.getOperand(4).getImm();
+ ISD::NodeType Opc;
+ switch (MI.getOpcode()) {
+ case AVR::Lsl32:
+ Opc = ISD::SHL;
+ break;
+ case AVR::Lsr32:
+ Opc = ISD::SRL;
+ break;
+ case AVR::Asr32:
+ Opc = ISD::SRA;
+ break;
+ }
+
+ // Read the input registers, with the most significant register at index 0.
+ std::array<std::pair<Register, int>, 4> Registers = {
+ std::pair(MI.getOperand(3).getReg(), AVR::sub_hi),
+ std::pair(MI.getOperand(3).getReg(), AVR::sub_lo),
+ std::pair(MI.getOperand(2).getReg(), AVR::sub_hi),
+ std::pair(MI.getOperand(2).getReg(), AVR::sub_lo),
+ };
+
+ // Do the shift. The registers are modified in-place.
+ insertMultibyteShift(MI, BB, Registers, Opc, ShiftAmt);
+
+ // Combine the 8-bit registers into 16-bit register pairs.
+ BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
+ .addReg(Registers[0].first, 0, Registers[0].second)
+ .addImm(AVR::sub_hi)
+ .addReg(Registers[1].first, 0, Registers[1].second)
+ .addImm(AVR::sub_lo);
+ BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
+ .addReg(Registers[2].first, 0, Registers[2].second)
+ .addImm(AVR::sub_hi)
+ .addReg(Registers[3].first, 0, Registers[3].second)
+ .addImm(AVR::sub_lo);
+
+ // Remove the pseudo instruction.
+ MI.eraseFromParent();
+ return BB;
+}
+
static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
if (I->getOpcode() == AVR::COPY) {
Register SrcReg = I->getOperand(1).getReg();
@@ -1901,6 +2066,10 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AVR::Asr8:
case AVR::Asr16:
return insertShift(MI, MBB);
+ case AVR::Lsl32:
+ case AVR::Lsr32:
+ case AVR::Asr32:
+ return insertWideShift(MI, MBB);
case AVR::MULRdRr:
case AVR::MULSRdRr:
return insertMul(MI, MBB);
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h
index 8d91c1bb996a..74092ca547e7 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -39,14 +39,17 @@ enum NodeType {
LSLBN, ///< Byte logical shift left N bits.
LSLWN, ///< Word logical shift left N bits.
LSLHI, ///< Higher 8-bit of word logical shift left.
+ LSLW, ///< Wide logical shift left.
LSR, ///< Logical shift right.
LSRBN, ///< Byte logical shift right N bits.
LSRWN, ///< Word logical shift right N bits.
LSRLO, ///< Lower 8-bit of word logical shift right.
+ LSRW, ///< Wide logical shift right.
ASR, ///< Arithmetic shift right.
ASRBN, ///< Byte arithmetic shift right N bits.
ASRWN, ///< Word arithmetic shift right N bits.
ASRLO, ///< Lower 8-bit of word arithmetic shift right.
+ ASRW, ///< Wide arithmetic shift right.
ROR, ///< Bit rotate right.
ROL, ///< Bit rotate left.
LSLLOOP, ///< A loop of single logical shift left instructions.
@@ -186,6 +189,8 @@ class AVRTargetLowering : public TargetLowering {
private:
MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *insertWideShift(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *insertCopyZero(MachineInstr &MI,
MachineBasicBlock *BB) const;
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 2a0f3208e42b..05ee94be7926 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -69,6 +69,9 @@ def AVRasrbn : SDNode<"AVRISD::ASRBN", SDTIntBinOp>;
def AVRlslwn : SDNode<"AVRISD::LSLWN", SDTIntBinOp>;
def AVRlsrwn : SDNode<"AVRISD::LSRWN", SDTIntBinOp>;
def AVRasrwn : SDNode<"AVRISD::ASRWN", SDTIntBinOp>;
+def AVRlslw : SDNode<"AVRISD::LSLW", SDTIntShiftDOp>;
+def AVRlsrw : SDNode<"AVRISD::LSRW", SDTIntShiftDOp>;
+def AVRasrw : SDNode<"AVRISD::ASRW", SDTIntShiftDOp>;
// Pseudo shift nodes for non-constant shift amounts.
def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
@@ -2337,6 +2340,11 @@ def Lsl16 : ShiftPseudo<(outs DREGS
: $src, i8
: $cnt))]>;
+def Lsl32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
+ (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
+ "# Lsl32 PSEUDO",
+ [(set i16:$dstlo, i16:$dsthi, (AVRlslw i16:$srclo, i16:$srchi, i8:$cnt))]>;
+
def Lsr8 : ShiftPseudo<(outs GPR8
: $dst),
(ins GPR8
@@ -2357,6 +2365,11 @@ def Lsr16 : ShiftPseudo<(outs DREGS
: $src, i8
: $cnt))]>;
+def Lsr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
+ (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
+ "# Lsr32 PSEUDO",
+ [(set i16:$dstlo, i16:$dsthi, (AVRlsrw i16:$srclo, i16:$srchi, i8:$cnt))]>;
+
def Rol8 : ShiftPseudo<(outs GPR8
: $dst),
(ins GPR8
@@ -2417,6 +2430,11 @@ def Asr16 : ShiftPseudo<(outs DREGS
: $src, i8
: $cnt))]>;
+def Asr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
+ (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
+ "# Asr32 PSEUDO",
+ [(set i16:$dstlo, i16:$dsthi, (AVRasrw i16:$srclo, i16:$srchi, i8:$cnt))]>;
+
// lowered to a copy from the zero register.
let usesCustomInserter=1 in
def CopyZero : Pseudo<(outs GPR8:$rd), (ins), "clrz\t$rd", [(set i8:$rd, 0)]>;
diff --git a/llvm/test/CodeGen/AVR/shift32.ll b/llvm/test/CodeGen/AVR/shift32.ll
new file mode 100644
index 000000000000..bc25e4098899
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/shift32.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=avr -mattr=movw -verify-machineinstrs | FileCheck %s
+
+define i32 @shl_i32_1(i32 %a) {
+; CHECK-LABEL: shl_i32_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 1
+ ret i32 %res
+}
+
+define i32 @shl_i32_2(i32 %a) {
+; CHECK-LABEL: shl_i32_2:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 2
+ ret i32 %res
+}
+
+; This is a special case: this shift is performed directly inside SelectionDAG
+; instead of as a custom lowering like the other shift operations.
+define i32 @shl_i32_16(i32 %a) {
+; CHECK-LABEL: shl_i32_16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movw r24, r22
+; CHECK-NEXT: ldi r22, 0
+; CHECK-NEXT: ldi r23, 0
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 16
+ ret i32 %res
+}
+
+; Combined with the register allocator, shift instructions can sometimes be
+; optimized away entirely. The least significant registers are simply stored
+; directly instead of moving them first.
+define void @shl_i32_16_ptr(i32 %a, ptr %ptr) {
+; CHECK-LABEL: shl_i32_16_ptr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movw r30, r20
+; CHECK-NEXT: std Z+2, r22
+; CHECK-NEXT: std Z+3, r23
+; CHECK-NEXT: ldi r24, 0
+; CHECK-NEXT: ldi r25, 0
+; CHECK-NEXT: st Z, r24
+; CHECK-NEXT: std Z+1, r25
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 16
+ store i32 %res, ptr %ptr
+ ret void
+}
+
+define i32 @lshr_i32_1(i32 %a) {
+; CHECK-LABEL: lshr_i32_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 1
+ ret i32 %res
+}
+
+define i32 @lshr_i32_2(i32 %a) {
+; CHECK-LABEL: lshr_i32_2:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: lsr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 2
+ ret i32 %res
+}
+
+define i32 @lshr_i32_16(i32 %a) {
+; CHECK-LABEL: lshr_i32_16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: ldi r24, 0
+; CHECK-NEXT: ldi r25, 0
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 16
+ ret i32 %res
+}
+
+define i32 @ashr_i32_1(i32 %a) {
+; CHECK-LABEL: ashr_i32_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: asr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 1
+ ret i32 %res
+}
+
+define i32 @ashr_i32_2(i32 %a) {
+; CHECK-LABEL: ashr_i32_2:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: asr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: asr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 2
+ ret i32 %res
+}
More information about the llvm-commits
mailing list