<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - [LV] Incorrect codegen for predicated loads/stores on targets supporting masking"
href="https://bugs.llvm.org/show_bug.cgi?id=50082">50082</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>[LV] Incorrect codegen for predicated loads/stores on targets supporting masking
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Loop Optimizer
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>bmahjour@ca.ibm.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>Created <span class=""><a href="attachment.cgi?id=24792" name="attach_24792" title="input IR">attachment 24792</a> <a href="attachment.cgi?id=24792&amp;action=edit" title="input IR">[details]</a></span>
input IR
The LoopVectorize pass has a bug in handling predication that is hidden by the
cost-model, where we could end up generating bad code that executes predicated
scalar loads/stores unconditionally if VF=1 is chosen.
Consider a simple loop like this:
```
void foo(int * restrict A, int * restrict B, int n)
{
for (int i = 0; i < n; i++)
if (A[i])
A[i] = B[i] + i;
}
```
with the corresponding IR below:
```
target datalayout =
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; Function Attrs: nounwind uwtable
define dso_local void @foo(i32* noalias %A, i32* noalias %B, i32 %n) #0 {
entry:
%cmp1 = icmp sgt i32 %n, 0
br i1 %cmp1, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.body: ; preds = %for.body.lr.ph, %for.inc
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4, !tbaa !2
%tobool.not = icmp eq i32 %0, 0
br i1 %tobool.not, label %for.inc, label %if.then
if.then: ; preds = %for.body
%arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
%1 = load i32, i32* %arrayidx2, align 4, !tbaa !2
%2 = trunc i64 %indvars.iv to i32
%add = add nsw i32 %1, %2
%arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
store i32 %add, i32* %arrayidx4, align 4, !tbaa !2
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.inc
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
ret void
}
attributes #0 = { nounwind uwtable "target-cpu"="core-avx2"
"target-features"="+avx,+avx2" }
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
```
Running the loop-vectorize pass with options that make the vectorizer transform
the loop (via interleaving) without vectorizing it (forcing the width to 1):
```
opt repro.ll -passes="loop-vectorize" -S -force-vector-width=1 -force-vector-interleave=2 -o out.ll
```
will generate `store` instructions that are not guarded and executed
unconditionally:
```
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%induction = add i64 %index, 0
%induction1 = add i64 %index, 1
%0 = getelementptr inbounds i32, i32* %A, i64 %induction
%1 = getelementptr inbounds i32, i32* %A, i64 %induction1
%2 = load i32, i32* %0, align 4, !tbaa !0
%3 = load i32, i32* %1, align 4, !tbaa !0
%4 = icmp eq i32 %2, 0
%5 = icmp eq i32 %3, 0
%6 = getelementptr inbounds i32, i32* %B, i64 %induction
%7 = getelementptr inbounds i32, i32* %B, i64 %induction1
%8 = load i32, i32* %6, align 4, !tbaa !0
%9 = load i32, i32* %7, align 4, !tbaa !0
%10 = trunc i64 %index to i32
%induction2 = add i32 %10, 0
%induction3 = add i32 %10, 1
%11 = add nsw i32 %8, %induction2
%12 = add nsw i32 %9, %induction3
store i32 %11, i32* %0, align 4, !tbaa !0
store i32 %12, i32* %1, align 4, !tbaa !0
%index.next = add i64 %index, 2
%13 = icmp eq i64 %index.next, %n.vec
br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4
middle.block: ; preds = %vector.body
```
This is a problem on targets that support masked load/store instructions
natively.</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>