[PATCH] D48278: [SelectionDAG] Fold redundant masking operations of shifted value
Diogo N. Sampaio via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 9 03:36:19 PDT 2018
dnsampaio marked 4 inline comments as done.
dnsampaio added a comment.
In https://reviews.llvm.org/D48278#1155201, @spatel wrote:
> This patch was reverted with https://reviews.llvm.org/rL336453 because it caused:
> https://bugs.llvm.org/show_bug.cgi?id=38084
Sorry about that.
> Beyond that, I don't understand the motivation. The patch increases the latency of a computation. Why is that beneficial? The x86 diff doesn't look like a win to me.
It reduces the number of computation operations from 3 to 2, and the number of integer constants kept for performing the masking from 2 to 1.
I don't see how it increases the latency, given that the masking and the shift are performed anyway.
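As a rough illustration (a C++ sketch of the shl test further down, not the patch itself; note 44032 == 172 << 8):
// Before the fold: two masks (two constants) plus the shift.
unsigned before(unsigned x) {
  unsigned lo = x & 172;           // first mask
  unsigned hi = (x << 8) & 44032;  // second mask; 44032 == 172 << 8
  return lo | hi;
}
// After the fold: the shift reuses the already-masked value, so a single
// mask (one constant) is enough.
unsigned after(unsigned x) {
  unsigned lo = x & 172;
  return lo | (lo << 8);
}
The AArch64 output shown below has exactly this shape: one and plus an orr with a shifted operand.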
> I don't know what the ARM/AArch output looked like before this patch. Always check in the baseline tests before making a code change, so we have that as a reference (and in case the patch is reverted, we won't lose the test coverage that motivated the code patch).
See the inline comments for the code changes.
================
Comment at: lib/CodeGen/SelectionDAG/DAGCombiner.cpp:4115
+ const SDValue &SHIFT = AND->getOperand(0);
+ if ((SHIFT.getNumOperands() != 2) || (!SHIFT.hasOneUse()))
+ return SDValue();
----------------
samparker wrote:
> Why not check that you have a supported shift here and potentially exit early?
All checks are already at the top of the function (except for the ISD::ROTL case). Merging them into a single big if statement just makes the code harder to understand and makes it harder for someone to add an exception to one of the rules. Technically speaking, one big test and a sequence of tests are the same thing.
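Purely for illustration (hypothetical checks, not the patch code), this is the style point being made:
// Hypothetical sketch: sequential early exits keep each rule on its own
// line, so an exception (say, for rotates) can be added next to the check
// it relaxes, without rewriting one merged condition.
static bool preconditionsHold(bool IsSupportedShift, bool HasOneUse,
                              bool IsRotate) {
  if (!IsSupportedShift)
    return false;
  if (!HasOneUse && !IsRotate) // a rotate-specific exception slots in here
    return false;
  return true;
}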
================
Comment at: lib/CodeGen/SelectionDAG/DAGCombiner.cpp:4180
+ N0Opcode = ISD::SRL;
+ /* fall-thru */
+ case ISD::SRL:
----------------
samparker wrote:
> use LLVM_FALLTHROUGH
I added LLVM_FALLTHROUGH, but GCC 8.1 still gives a warning, so I'm also keeping the /* fall-through */ comment:
/home/diosam01/LLVM/local/src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp: In member function ‘llvm::SDValue {anonymous}::DAGCombiner::foldRedundantShiftedMasks(llvm::SDNode*)’:
/home/diosam01/LLVM/local/src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp:4180:18: warning: this statement may fall through [-Wimplicit-fallthrough=]
N0Opcode = ISD::SRL;
~~~~~~~~~^~~~~
/home/diosam01/LLVM/local/src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp:4182:5: note: here
case ISD::SRL:
^~~~
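For reference, the pattern described above as a toy sketch (not the patch code), with both LLVM_FALLTHROUGH and a plain comment on the intentional fall-through:
#include "llvm/Support/Compiler.h" // provides LLVM_FALLTHROUGH

// Toy sketch only: annotate the intentional fall-through with the macro and
// keep a comment as well, as described above.
static int normalize(int Kind) {
  switch (Kind) {
  case 0:
    Kind = 1;
    LLVM_FALLTHROUGH; /* fall-through */
  case 1:
    return Kind;
  default:
    return -1;
  }
}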
================
Comment at: lib/CodeGen/SelectionDAG/DAGCombiner.cpp:4203
+ DAG.getNode(N0Opcode, DL, VT, ShiftTheAND, SHIFT.getOperand(1));
+ AddToWorklist(OtherUser);
+ return NewShift;
----------------
samparker wrote:
> I think you should be adding NewShift instead.
No, NewShift and its users are added to the worklist automatically at DAGCombiner.cpp lines 1483 and 1484. In the log you can see that a transformed node is always visited again in the same combining phase. I want to add the other user of this value, as it might now be its sole remaining user.
================
Comment at: test/CodeGen/AArch64/FoldRedundantShiftedMasking.ll:3-12
+define i32 @ror(i32 %a) {
+entry:
+ %m2 = and i32 %a, 3855
+ %shl = shl i32 %a, 24
+ %shr = lshr i32 %a, 8
+ %or = or i32 %shl, %shr
+ %m1 = and i32 %or, 251658255
----------------
** Before patch:
mov w9, #15
mov w8, #3855
movk w9, #3840, lsl #16
and w8, w0, w8
and w9, w9, w0, ror #8
orr w0, w9, w8
** After patch:
mov w8, #3855
and w8, w0, w8
orr w0, w8, w8, ror #8
================
Comment at: test/CodeGen/AArch64/FoldRedundantShiftedMasking.ll:18-26
+define i32 @shl(i16 %a) {
+entry:
+ %0 = sext i16 %a to i32
+ %1 = and i32 %0, 172
+ %2 = shl i32 %0, 8
+ %3 = and i32 %2, 44032
+ %4 = or i32 %1, %3
----------------
** Before patch:
sxth w8, w0
mov w9, #172
mov w10, #44032
and w9, w8, w9
and w8, w10, w8, lsl #8
orr w0, w9, w8
** After patch:
mov w8, #172
and w8, w0, w8
orr w0, w8, w8, lsl #8
================
Comment at: test/CodeGen/AArch64/FoldRedundantShiftedMasking.ll:18-26
+define i32 @shl(i16 %a) {
+entry:
+ %0 = sext i16 %a to i32
+ %1 = and i32 %0, 172
+ %2 = shl i32 %0, 8
+ %3 = and i32 %2, 44032
+ %4 = or i32 %1, %3
----------------
dnsampaio wrote:
> ** Before patch:
> sxth w8, w0
> mov w9, #172
> mov w10, #44032
> and w9, w8, w9
> and w8, w10, w8, lsl #8
> orr w0, w9, w8
> ** After patch:
> mov w8, #172
> and w8, w0, w8
> orr w0, w8, w8, lsl #8
** In x86-64:
* Before patch:
movswl %di, %eax
movl %eax, %ecx
andl $172, %ecx
shll $8, %eax
andl $44032, %eax # imm = 0xAC00
orl %ecx, %eax
* After patch:
andl $172, %edi
movl %edi, %eax
shll $8, %eax
leal (%rax,%rdi), %eax
================
Comment at: test/CodeGen/AArch64/FoldRedundantShiftedMasking.ll:32-40
+define i32 @lshr(i16 %a) {
+entry:
+ %0 = sext i16 %a to i32
+ %1 = and i32 %0, 44032
+ %2 = lshr i32 %0, 8
+ %3 = and i32 %2, 172
+ %4 = or i32 %1, %3
----------------
** Before patch:
sxth w8, w0
mov w9, #44032
mov w10, #172
and w9, w8, w9
and w8, w10, w8, lsr #8
orr w0, w9, w8
** After patch:
mov w8, #44032
and w8, w0, w8
orr w0, w8, w8, lsr #8
================
Comment at: test/CodeGen/AArch64/FoldRedundantShiftedMasking.ll:32-40
+define i32 @lshr(i16 %a) {
+entry:
+ %0 = sext i16 %a to i32
+ %1 = and i32 %0, 44032
+ %2 = lshr i32 %0, 8
+ %3 = and i32 %2, 172
+ %4 = or i32 %1, %3
----------------
dnsampaio wrote:
> ** Before patch:
> sxth w8, w0
> mov w9, #44032
> mov w10, #172
> and w9, w8, w9
> and w8, w10, w8, lsr #8
> orr w0, w9, w8
> ** After patch:
> mov w8, #44032
> and w8, w0, w8
> orr w0, w8, w8, lsr #8
** In x86-64
* Before patch:
movswl %di, %eax
movl %eax, %ecx
andl $44032, %ecx # imm = 0xAC00
shrl $8, %eax
andl $172, %eax
orl %ecx, %eax
* After patch:
andl $44032, %edi # imm = 0xAC00
movl %edi, %eax
shrl $8, %eax
leal (%rax,%rdi), %eax
================
Comment at: test/CodeGen/AArch64/FoldRedundantShiftedMasking.ll:46-54
+define i32 @ashr(i16 %a) {
+entry:
+ %0 = sext i16 %a to i32
+ %1 = and i32 %0, 44032
+ %2 = ashr i32 %0, 8
+ %3 = and i32 %2, 172
+ %4 = or i32 %1, %3
----------------
** Before patch:
sxth w8, w0
mov w9, #44032
mov w10, #172
and w9, w8, w9
and w8, w10, w8, lsr #8
orr w0, w9, w8
** After patch:
mov w8, #44032
and w8, w0, w8
orr w0, w8, w8, lsr #8
================
Comment at: test/CodeGen/AArch64/FoldRedundantShiftedMasking.ll:46-54
+define i32 @ashr(i16 %a) {
+entry:
+ %0 = sext i16 %a to i32
+ %1 = and i32 %0, 44032
+ %2 = ashr i32 %0, 8
+ %3 = and i32 %2, 172
+ %4 = or i32 %1, %3
----------------
dnsampaio wrote:
> ** Before patch:
> sxth w8, w0
> mov w9, #44032
> mov w10, #172
> and w9, w8, w9
> and w8, w10, w8, lsr #8
> orr w0, w9, w8
> ** After patch:
> mov w8, #44032
> and w8, w0, w8
> orr w0, w8, w8, lsr #8
** In x86-64
* Before patch:
movswl %di, %eax
movl %eax, %ecx
andl $44032, %ecx # imm = 0xAC00
shrl $8, %eax
andl $172, %eax
orl %ecx, %eax
* After patch:
andl $44032, %edi # imm = 0xAC00
movl %edi, %eax
shrl $8, %eax
leal (%rax,%rdi), %eax
================
Comment at: test/CodeGen/ARM/FoldRedundantShiftedMasking.ll:22
+; ARM-NEXT: mov pc, lr
+
+define i32 @shl(i16 %a) {
----------------
** Before patch:
mov r1, #15
mov r2, #15
orr r1, r1, #3840
orr r2, r2, #251658240
and r1, r0, r1
and r0, r2, r0, ror #8
orr r0, r0, r1
** After patch:
mov r1, #15
orr r1, r1, #3840
and r0, r0, r1
orr r0, r0, r0, ror #8
================
Comment at: test/CodeGen/ARM/FoldRedundantShiftedMasking.ll:23-34
+define i32 @shl(i16 %a) {
+entry:
+ %0 = sext i16 %a to i32
+ %1 = and i32 %0, 172
+ %2 = shl i32 %0, 8
+ %3 = and i32 %2, 44032
+ %4 = or i32 %1, %3
----------------
** Before patch:
lsl r0, r0, #16
mov r1, #172
and r0, r1, r0, asr #16
orr r0, r0, r0, lsl #8
** After patch:
and r0, r0, #172
orr r0, r0, r0, lsl #8
================
Comment at: test/CodeGen/ARM/FoldRedundantShiftedMasking.ll:36-44
+define i32 @lshr(i16 %a) {
+entry:
+ %0 = sext i16 %a to i32
+ %1 = and i32 %0, 44032
+ %2 = lshr i32 %0, 8
+ %3 = and i32 %2, 172
+ %4 = or i32 %1, %3
----------------
** Before patch:
lsl r0, r0, #16
mov r1, #44032
and r1, r1, r0, asr #16
asr r0, r0, #16
mov r2, #172
and r0, r2, r0, lsr #8
orr r0, r1, r0
** After patch:
and r0, r0, #44032
orr r0, r0, r0, lsr #8
================
Comment at: test/CodeGen/ARM/FoldRedundantShiftedMasking.ll:49-57
+define i32 @ashr(i16 %a) {
+entry:
+ %0 = sext i16 %a to i32
+ %1 = and i32 %0, 44032
+ %2 = ashr i32 %0, 8
+ %3 = and i32 %2, 172
+ %4 = or i32 %1, %3
----------------
** Before patch:
lsl r0, r0, #16
mov r1, #44032
and r1, r1, r0, asr #16
asr r0, r0, #16
mov r2, #172
and r0, r2, r0, lsr #8
orr r0, r1, r0
** After patch:
and r0, r0, #44032
orr r0, r0, r0, lsr #8
https://reviews.llvm.org/D48278