[llvm-bugs] [Bug 50082] New: [LV] Incorrect codegen for predicated loads/stores on targets supporting masking
via llvm-bugs
llvm-bugs at lists.llvm.org
Thu Apr 22 14:35:46 PDT 2021
https://bugs.llvm.org/show_bug.cgi?id=50082
Bug ID: 50082
Summary: [LV] Incorrect codegen for predicated loads/stores on
targets supporting masking
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Loop Optimizer
Assignee: unassignedbugs at nondot.org
Reporter: bmahjour at ca.ibm.com
CC: llvm-bugs at lists.llvm.org
Created attachment 24792
--> https://bugs.llvm.org/attachment.cgi?id=24792&action=edit
input IR
The LoopVectorize pass has a bug in its handling of predication that is hidden
by the cost model: when VF=1 is chosen, it can generate bad code that executes
predicated scalar loads/stores unconditionally.
Consider a simple loop like this:
```
void foo(int * restrict A, int * restrict B, int n)
{
for (int i = 0; i < n; i++)
if (A[i])
A[i] = B[i] + i;
}
```
with the corresponding IR below:
```
target datalayout =
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; Function Attrs: nounwind uwtable
define dso_local void @foo(i32* noalias %A, i32* noalias %B, i32 %n) #0 {
entry:
%cmp1 = icmp sgt i32 %n, 0
br i1 %cmp1, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.body: ; preds = %for.body.lr.ph, %for.inc
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4, !tbaa !2
%tobool.not = icmp eq i32 %0, 0
br i1 %tobool.not, label %for.inc, label %if.then
if.then: ; preds = %for.body
%arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
%1 = load i32, i32* %arrayidx2, align 4, !tbaa !2
%2 = trunc i64 %indvars.iv to i32
%add = add nsw i32 %1, %2
%arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
store i32 %add, i32* %arrayidx4, align 4, !tbaa !2
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.inc
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
attributes #0 = { nounwind uwtable "target-cpu"="core-avx2"
"target-features"="+avx,+avx2" }
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
```
Running the loop-vectorize pass with options that make the vectorizer transform
the loop (via interleaving) without actually vectorizing it (forcing the width
to 1), as follows:
```
opt repro.ll -passes="loop-vectorize" -S -force-vector-width=1 -force-vector-interleave=2 -o out.ll
```
will generate `store` instructions that are not guarded and are therefore
executed unconditionally:
```
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%induction = add i64 %index, 0
%induction1 = add i64 %index, 1
%0 = getelementptr inbounds i32, i32* %A, i64 %induction
%1 = getelementptr inbounds i32, i32* %A, i64 %induction1
%2 = load i32, i32* %0, align 4, !tbaa !0
%3 = load i32, i32* %1, align 4, !tbaa !0
%4 = icmp eq i32 %2, 0
%5 = icmp eq i32 %3, 0
%6 = getelementptr inbounds i32, i32* %B, i64 %induction
%7 = getelementptr inbounds i32, i32* %B, i64 %induction1
%8 = load i32, i32* %6, align 4, !tbaa !0
%9 = load i32, i32* %7, align 4, !tbaa !0
%10 = trunc i64 %index to i32
%induction2 = add i32 %10, 0
%induction3 = add i32 %10, 1
%11 = add nsw i32 %8, %induction2
%12 = add nsw i32 %9, %induction3
store i32 %11, i32* %0, align 4, !tbaa !0
store i32 %12, i32* %1, align 4, !tbaa !0
%index.next = add i64 %index, 2
%13 = icmp eq i64 %index.next, %n.vec
br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4
middle.block: ; preds = %vector.body
```
This is a problem on targets that support masked load/store instructions
natively.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210422/d4bf3ed4/attachment.html>
More information about the llvm-bugs
mailing list