[llvm-bugs] [Bug 50082] New: [LV] Incorrect codegen for predicated loads/stores on targets supporting masking

Thu Apr 22 14:35:46 PDT 2021

https://bugs.llvm.org/show_bug.cgi?id=50082

            Bug ID: 50082
           Summary: [LV] Incorrect codegen for predicated loads/stores on
                    targets supporting masking
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: bmahjour at ca.ibm.com
                CC: llvm-bugs at lists.llvm.org

Created attachment 24792
  --> https://bugs.llvm.org/attachment.cgi?id=24792&action=edit
input IR

The LoopVectorize pass has a bug in its handling of predication that is
normally hidden by the cost model: if VF=1 is chosen, we can end up generating
bad code that executes predicated scalar loads/stores unconditionally.

Consider a simple loop like this:
```
void foo(int * restrict A, int * restrict B, int n)
{
  for (int i = 0; i < n; i++)
    if (A[i])
      A[i] = B[i] + i;
}
```

with the corresponding IR below:

```
target datalayout =
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Function Attrs: nounwind uwtable
define dso_local void @foo(i32* noalias %A, i32* noalias %B, i32 %n) #0 {
entry:
  %cmp1 = icmp sgt i32 %n, 0
  br i1 %cmp1, label %for.body.lr.ph, label %for.end

for.body.lr.ph:                                   ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.body:                                         ; preds = %for.body.lr.ph,
%for.inc
  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %tobool.not = icmp eq i32 %0, 0
  br i1 %tobool.not, label %for.inc, label %if.then

if.then:                                          ; preds = %for.body
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4, !tbaa !2
  %2 = trunc i64 %indvars.iv to i32
  %add = add nsw i32 %1, %2
  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  store i32 %add, i32* %arrayidx4, align 4, !tbaa !2
  br label %for.inc

for.inc:                                          ; preds = %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge

for.cond.for.end_crit_edge:                       ; preds = %for.inc
  br label %for.end

for.end:                                          ; preds =
%for.cond.for.end_crit_edge, %entry
  ret void
}

attributes #0 = { nounwind uwtable "target-cpu"="core-avx2"
"target-features"="+avx,+avx2" }

!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
```

Running the loop-vectorize pass with options that make the vectorizer transform
the loop (via interleaving) without vectorizing it (forcing the width to 1), as
follows:
```
opt repro.ll -passes="loop-vectorize" -S -force-vector-width=1
-force-vector-interleave=2 -o out.ll
```

will generate `store` instructions that are not guarded and are therefore
executed unconditionally:

```
vector.body:                                      ; preds = %vector.body,
%vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %induction = add i64 %index, 0
  %induction1 = add i64 %index, 1
  %0 = getelementptr inbounds i32, i32* %A, i64 %induction
  %1 = getelementptr inbounds i32, i32* %A, i64 %induction1
  %2 = load i32, i32* %0, align 4, !tbaa !0
  %3 = load i32, i32* %1, align 4, !tbaa !0
  %4 = icmp eq i32 %2, 0
  %5 = icmp eq i32 %3, 0
  %6 = getelementptr inbounds i32, i32* %B, i64 %induction
  %7 = getelementptr inbounds i32, i32* %B, i64 %induction1
  %8 = load i32, i32* %6, align 4, !tbaa !0
  %9 = load i32, i32* %7, align 4, !tbaa !0
  %10 = trunc i64 %index to i32
  %induction2 = add i32 %10, 0
  %induction3 = add i32 %10, 1
  %11 = add nsw i32 %8, %induction2
  %12 = add nsw i32 %9, %induction3
  store i32 %11, i32* %0, align 4, !tbaa !0
  store i32 %12, i32* %1, align 4, !tbaa !0
  %index.next = add i64 %index, 2
  %13 = icmp eq i64 %index.next, %n.vec
  br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4

middle.block:                                     ; preds = %vector.body
```

This is a problem on targets that support masked load/store instructions
natively.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210422/d4bf3ed4/attachment.html>