[llvm] fad5e0c - [AVR] Optimize 32-bit shifts: reverse shift + move
Ayke van Laethem via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 8 11:05:58 PST 2023
Author: Ayke van Laethem
Date: 2023-01-08T20:05:31+01:00
New Revision: fad5e0cf50f119f083dfc82e08994825cae5001f
URL: https://github.com/llvm/llvm-project/commit/fad5e0cf50f119f083dfc82e08994825cae5001f
DIFF: https://github.com/llvm/llvm-project/commit/fad5e0cf50f119f083dfc82e08994825cae5001f.diff
LOG: [AVR] Optimize 32-bit shifts: reverse shift + move
This optimization turns shifts whose amount is almost a multiple of 8 (i.e.
6 or 7 modulo 8) into a one- or two-bit shift in the opposite direction
followed by register moves. Unfortunately it doesn't compose well with the
other optimizations I've tried, so it is kept separate from them.
Differential Revision: https://reviews.llvm.org/D140572
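As an illustration (not part of the patch; shl7/lshr7 are made-up names for
this sketch), the bit-level identity the lowering relies on can be checked on
the host with plain C++: a 32-bit shift by 7 is a single-bit shift in the
opposite direction, recovery of the one bit that falls out, and a whole-byte
move. On AVR the byte-sized "<< 8" / ">> 8" steps below are just register
renames, and the dropped bit is recovered from the carry flag with ror/adc
into a fresh register.

#include <cassert>
#include <cstdint>

// shl by 7: shift right by 1, re-insert the dropped low bit, move up a byte.
static uint32_t shl7(uint32_t x) {
  return ((x >> 1) << 8) | ((x & 1u) << 7);
}

// lshr by 7: shift left by 1, re-insert the dropped high bit, move down a byte.
static uint32_t lshr7(uint32_t x) {
  return ((x << 1) >> 8) | ((x >> 31) << 24);
}

int main() {
  for (uint32_t x : {0u, 1u, 0x80000001u, 0xdeadbeefu}) {
    assert(shl7(x) == x << 7);
    assert(lshr7(x) == x >> 7);
  }
}

A modulo-6 shift applies the same trick twice (two single-bit shifts), and the
arithmetic-shift case differs only in that the recovered top byte is a
sign-extension byte produced with sbc instead of adc, as the new shl_i32_6,
lshr_i32_6 and ashr_i32_7 tests below show.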
Added:
Modified:
llvm/lib/Target/AVR/AVRISelLowering.cpp
llvm/test/CodeGen/AVR/shift32.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 7690a5fe6497..d2391b7bb003 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -1873,6 +1873,112 @@ static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ZeroReg)
.addReg(STI.getZeroRegister());
+ // Handle shifts whose amount modulo 8 is 6 or 7. These are a bit more
+ // complicated than most shifts and hard to compose with the rest, so they
+ // are special cased.
+ // The basic idea is to shift one or two bits in the opposite direction and
+ // then move registers around to get the correct end result.
+ if (ShiftLeft && (ShiftAmt % 8) >= 6) {
+ // Left shift modulo 6 or 7.
+
+ // Create a slice of the registers we're going to modify, to ease working
+ // with them.
+ size_t ShiftRegsOffset = ShiftAmt / 8;
+ size_t ShiftRegsSize = Regs.size() - ShiftRegsOffset;
+ MutableArrayRef<std::pair<Register, int>> ShiftRegs =
+ Regs.slice(ShiftRegsOffset, ShiftRegsSize);
+
+ // Shift one to the right, keeping the least significant bit as the carry
+ // bit.
+ insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1);
+
+ // Rotate the least significant bit from the carry bit into a new register
+ // (that starts out zero).
+ Register LowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), LowByte).addReg(ZeroReg);
+
+ // Shift one more to the right if this is a modulo-6 shift.
+ if (ShiftAmt % 8 == 6) {
+ insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1);
+ Register NewLowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), NewLowByte).addReg(LowByte);
+ LowByte = NewLowByte;
+ }
+
+ // Move all registers to the left, zeroing the bottom registers as needed.
+ for (size_t I = 0; I < Regs.size(); I++) {
+ int ShiftRegsIdx = I + 1;
+ if (ShiftRegsIdx < (int)ShiftRegs.size()) {
+ Regs[I] = ShiftRegs[ShiftRegsIdx];
+ } else if (ShiftRegsIdx == (int)ShiftRegs.size()) {
+ Regs[I] = std::pair(LowByte, 0);
+ } else {
+ Regs[I] = std::pair(ZeroReg, 0);
+ }
+ }
+
+ return;
+ }
+
+ // Right shift modulo 6 or 7.
+ if (!ShiftLeft && (ShiftAmt % 8) >= 6) {
+ // Create a view on the registers we're going to modify, to ease working
+ // with them.
+ size_t ShiftRegsSize = Regs.size() - (ShiftAmt / 8);
+ MutableArrayRef<std::pair<Register, int>> ShiftRegs =
+ Regs.slice(0, ShiftRegsSize);
+
+ // Shift one to the left.
+ insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1);
+
+ // Sign or zero extend the most significant register into a new register.
+ // The HighByte is the byte that still has one (or two) bits from the
+ // original value. The ExtByte is purely a zero/sign extend byte (all bits
+ // are either 0 or 1).
+ Register HighByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ Register ExtByte = 0;
+ if (ArithmeticShift) {
+ // Sign-extend the bit that was shifted out last.
+ BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), HighByte)
+ .addReg(HighByte, RegState::Undef)
+ .addReg(HighByte, RegState::Undef);
+ ExtByte = HighByte;
+ // The highest bit of the original value is the same as the sign-extend
+ // byte, so HighByte and ExtByte are the same.
+ } else {
+ // Use the zero register for zero extending.
+ ExtByte = ZeroReg;
+ // Rotate most significant bit into a new register (that starts out zero).
+ BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), HighByte)
+ .addReg(ExtByte)
+ .addReg(ExtByte);
+ }
+
+ // Shift one more to the left for modulo 6 shifts.
+ if (ShiftAmt % 8 == 6) {
+ insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1);
+ // Shift the topmost bit into the HighByte.
+ Register NewExt = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), NewExt)
+ .addReg(HighByte)
+ .addReg(HighByte);
+ HighByte = NewExt;
+ }
+
+ // Move all registers to the right, while sign- or zero-extending.
+ for (int I = Regs.size() - 1; I >= 0; I--) {
+ int ShiftRegsIdx = I - (Regs.size() - ShiftRegs.size()) - 1;
+ if (ShiftRegsIdx >= 0) {
+ Regs[I] = ShiftRegs[ShiftRegsIdx];
+ } else if (ShiftRegsIdx == -1) {
+ Regs[I] = std::pair(HighByte, 0);
+ } else {
+ Regs[I] = std::pair(ExtByte, 0);
+ }
+ }
+
+ return;
+ }
+
// For shift amounts of at least one register, simply rename the registers and
// zero the bottom registers.
while (ShiftLeft && ShiftAmt >= 8) {
diff --git a/llvm/test/CodeGen/AVR/shift32.ll b/llvm/test/CodeGen/AVR/shift32.ll
index cc838101e63c..3b58c1e56214 100644
--- a/llvm/test/CodeGen/AVR/shift32.ll
+++ b/llvm/test/CodeGen/AVR/shift32.ll
@@ -78,6 +78,50 @@ define i32 @shl_i32_5(i32 %a) {
ret i32 %res
}
+; shift two to the right and move the registers around
+define i32 @shl_i32_6(i32 %a) {
+; CHECK-LABEL: shl_i32_6:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: mov r18, r1
+; CHECK-NEXT: ror r18
+; CHECK-NEXT: lsr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: ror r18
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r19, r22
+; CHECK-NEXT: movw r22, r18
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 6
+ ret i32 %res
+}
+
+
+; shift one to the right and move registers around
+define i32 @shl_i32_7(i32 %a) {
+; CHECK-LABEL: shl_i32_7:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: mov r18, r1
+; CHECK-NEXT: ror r18
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r19, r22
+; CHECK-NEXT: movw r22, r18
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 7
+ ret i32 %res
+}
+
define i32 @shl_i32_8(i32 %a) {
; CHECK-LABEL: shl_i32_8:
; CHECK: ; %bb.0:
@@ -128,6 +172,22 @@ define i32 @shl_i32_12(i32 %a) {
ret i32 %res
}
+define i32 @shl_i32_15(i32 %a) {
+; CHECK-LABEL: shl_i32_15:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movw r18, r22
+; CHECK-NEXT: lsr r24
+; CHECK-NEXT: ror r19
+; CHECK-NEXT: ror r18
+; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: mov r22, r1
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 15
+ ret i32 %res
+}
+
; This is a special case: this shift is performed directly inside SelectionDAG
; instead of as a custom lowering like the other shift operations.
define i32 @shl_i32_16(i32 %a) {
@@ -175,6 +235,21 @@ define i32 @shl_i32_28(i32 %a) {
ret i32 %res
}
+; move the rightmost bit to the leftmost bit and clear the rest
+define i32 @shl_i32_31(i32 %a) {
+; CHECK-LABEL: shl_i32_31:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsr r22
+; CHECK-NEXT: mov r25, r1
+; CHECK-NEXT: ror r25
+; CHECK-NEXT: mov r24, r1
+; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: mov r22, r1
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 31
+ ret i32 %res
+}
+
define i32 @lshr_i32_1(i32 %a) {
; CHECK-LABEL: lshr_i32_1:
; CHECK: ; %bb.0:
@@ -225,6 +300,49 @@ define i32 @lshr_i32_4(i32 %a) {
ret i32 %res
}
+define i32 @lshr_i32_6(i32 %a) {
+; CHECK-LABEL: lshr_i32_6:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: mov r19, r1
+; CHECK-NEXT: rol r19
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: rol r19
+; CHECK-NEXT: mov r18, r25
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 6
+ ret i32 %res
+}
+
+define i32 @lshr_i32_7(i32 %a) {
+; CHECK-LABEL: lshr_i32_7:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: mov r19, r1
+; CHECK-NEXT: rol r19
+; CHECK-NEXT: mov r18, r25
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 7
+ ret i32 %res
+}
+
define i32 @lshr_i32_8(i32 %a) {
; CHECK-LABEL: lshr_i32_8:
; CHECK: ; %bb.0:
@@ -280,6 +398,20 @@ define i32 @lshr_i32_24(i32 %a) {
ret i32 %res
}
+define i32 @lshr_i32_31(i32 %a) {
+; CHECK-LABEL: lshr_i32_31:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r25
+; CHECK-NEXT: mov r22, r1
+; CHECK-NEXT: rol r22
+; CHECK-NEXT: mov r25, r1
+; CHECK-NEXT: mov r24, r1
+; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 31
+ ret i32 %res
+}
+
define i32 @ashr_i32_1(i32 %a) {
; CHECK-LABEL: ashr_i32_1:
; CHECK: ; %bb.0:
@@ -333,6 +465,24 @@ define i32 @ashr_i32_4(i32 %a) {
ret i32 %res
}
+define i32 @ashr_i32_7(i32 %a) {
+; CHECK-LABEL: ashr_i32_7:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: mov r18, r25
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 7
+ ret i32 %res
+}
+
; TODO: this could be optimized to 4 movs, instead of 6.
define i32 @ashr_i32_8(i32 %a) {
; CHECK-LABEL: ashr_i32_8:
@@ -375,3 +525,64 @@ define i32 @ashr_i32_17(i32 %a) {
%res = ashr i32 %a, 17
ret i32 %res
}
+
+define i32 @ashr_i32_22(i32 %a) {
+; CHECK-LABEL: ashr_i32_22:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: lsl r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: mov r23, r19
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: mov r18, r19
+; CHECK-NEXT: mov r22, r25
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 22
+ ret i32 %res
+}
+
+define i32 @ashr_i32_23(i32 %a) {
+; CHECK-LABEL: ashr_i32_23:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: mov r18, r19
+; CHECK-NEXT: mov r23, r19
+; CHECK-NEXT: mov r22, r25
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 23
+ ret i32 %res
+}
+
+define i32 @ashr_i32_30(i32 %a) {
+; CHECK-LABEL: ashr_i32_30:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r25
+; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: lsl r25
+; CHECK-NEXT: mov r22, r19
+; CHECK-NEXT: rol r22
+; CHECK-NEXT: mov r18, r19
+; CHECK-NEXT: mov r23, r19
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 30
+ ret i32 %res
+}
+
+define i32 @ashr_i32_31(i32 %a) {
+; CHECK-LABEL: ashr_i32_31:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r25
+; CHECK-NEXT: sbc r23, r23
+; CHECK-NEXT: mov r22, r23
+; CHECK-NEXT: movw r24, r22
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 31
+ ret i32 %res
+}