[llvm] r369723 - [AlignmentFromAssumptions] getNewAlignmentDiff(): use getURemExpr()

Thu Aug 22 19:17:04 PDT 2019

Author: maskray
Date: Thu Aug 22 19:17:04 2019
New Revision: 369723

URL: http://llvm.org/viewvc/llvm-project?rev=369723&view=rev
Log:
[AlignmentFromAssumptions] getNewAlignmentDiff(): use getURemExpr()

The alignment is calculated incorrectly, so aligned mov instructions are sometimes not generated, as the example below shows:

```
// b.cc
typedef long long index;

extern "C" index g_tid;
extern "C" index g_num;

void add3(float* __restrict__ a, float* __restrict__ b, float* __restrict__ c) {
    index n = 64*1024;
    index m = 16*1024;
    index k = 4*1024;
    index tid = g_tid;
    index num = g_num;
    __builtin_assume_aligned(a, 32);
    __builtin_assume_aligned(b, 32);
    __builtin_assume_aligned(c, 32);
    for (index i0=tid*k; i0<m; i0+=num*k)
        for (index i1=0; i1<n*m; i1+=m)
            for (index i2=0; i2<k; i2++)
                c[i1+i0+i2] = b[i0+i2] + a[i1+i0+i2];
}
```

Compiling with `clang b.cc -Ofast -march=skylake -mavx2 -S` produces:

```
vmovaps -224(%rdi,%rbx,4), %ymm0
vmovups -192(%rdi,%rbx,4), %ymm1         # should be vmovaps
vmovups -160(%rdi,%rbx,4), %ymm2         # should be vmovaps
vmovups -128(%rdi,%rbx,4), %ymm3         # should be vmovaps
vaddps  -224(%rsi,%rbx,4), %ymm0, %ymm0
vaddps  -192(%rsi,%rbx,4), %ymm1, %ymm1
vaddps  -160(%rsi,%rbx,4), %ymm2, %ymm2
vaddps  -128(%rsi,%rbx,4), %ymm3, %ymm3
vmovaps %ymm0, -224(%rdx,%rbx,4)
vmovups %ymm1, -192(%rdx,%rbx,4)         # should be vmovaps
vmovups %ymm2, -160(%rdx,%rbx,4)         # should be vmovaps
vmovups %ymm3, -128(%rdx,%rbx,4)         # should be vmovaps
```

Differential Revision: https://reviews.llvm.org/D66575
Patch by Dun Liang

Modified:
    llvm/trunk/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
    llvm/trunk/test/Transforms/AlignmentFromAssumptions/simple.ll

Modified: llvm/trunk/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp?rev=369723&r1=369722&r2=369723&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp Thu Aug 22 19:17:04 2019
@@ -93,9 +93,7 @@ static unsigned getNewAlignmentDiff(cons
                                     const SCEV *AlignSCEV,
                                     ScalarEvolution *SE) {
   // DiffUnits = Diff % int64_t(Alignment)
-  const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);
-  const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
-  const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
+  const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV);
 
   LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
                     << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");

Modified: llvm/trunk/test/Transforms/AlignmentFromAssumptions/simple.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/AlignmentFromAssumptions/simple.ll?rev=369723&r1=369722&r2=369723&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/AlignmentFromAssumptions/simple.ll (original)
+++ llvm/trunk/test/Transforms/AlignmentFromAssumptions/simple.ll Thu Aug 22 19:17:04 2019
@@ -90,6 +90,61 @@ for.end:
 ; CHECK: ret i32 %add.lcssa
 }
 
+; test D66575
+; def hoo2(a, id, num):
+;   for i0 in range(id*64, 4096, num*64):
+;     for i1 in range(0, 4096, 32):
+;       for i2 in range(0, 4096, 32):
+;         load(a, i0+i1+i2+32)
+define void @hoo2(i32* nocapture %a, i64 %id, i64 %num) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %id.mul = shl nsw i64 %id, 6
+  %num.mul = shl nsw i64 %num, 6
+  br label %for0.body
+
+for0.body:
+  %i0 = phi i64 [ %id.mul, %entry ], [ %i0.next, %for0.end ]
+  br label %for1.body
+
+for1.body:
+  %i1 = phi i64 [ 0, %for0.body ], [ %i1.next, %for1.end ]
+  br label %for2.body
+
+for2.body:
+  %i2 = phi i64 [ 0, %for1.body ], [ %i2.next, %for2.body ]
+
+  %t1 = add nuw nsw i64 %i0, %i1
+  %t2 = add nuw nsw i64 %t1, %i2
+  %t3 = add nuw nsw i64 %t2, 32
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %t3
+  %x = load i32, i32* %arrayidx, align 4
+
+  %i2.next = add nuw nsw i64 %i2, 32
+  %cmp2 = icmp ult i64 %i2.next, 4096
+  br i1 %cmp2, label %for2.body, label %for1.end
+
+for1.end:
+  %i1.next = add nuw nsw i64 %i1, 32
+  %cmp1 = icmp ult i64 %i1.next, 4096
+  br i1 %cmp1, label %for1.body, label %for0.end
+
+for0.end:
+  %i0.next = add nuw nsw i64 %i0, %num.mul
+  %cmp0 = icmp ult i64 %i0.next, 4096
+  br i1 %cmp0, label %for0.body, label %return
+
+return:
+  ret void
+
+; CHECK-LABEL: @hoo2
+; CHECK: load i32, i32* %arrayidx, align 32
+; CHECK: ret void
+}
+
 define i32 @joo(i32* nocapture %a) nounwind uwtable readonly {
 entry:
   %ptrint = ptrtoint i32* %a to i64