<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/115169>115169</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[VPlan][LoopVectorize] Potential issue in how replicate recipe is lowered to IR
</td>
</tr>
<tr>
<th>Labels</th>
<td>
miscompilation,
vectorizers
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
fhahn
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
danilaml
</td>
</tr>
</table>
<pre>
It's a bit of a tricky one, since I can no longer reproduce it on `main` due to the changes in 5a4c6f97997f3cdfa9d98f7f0b546e331ee9cc4a, but I believe they didn't actually fix the underlying problem and only made it (almost?) impossible to trigger the conditions for it.
As a workaround, I've introduced a `cl::opt` for testing that forces a replicate recipe over a widen recipe, so that this can be triggered for regular instructions (since it's not possible to trigger it for loads after the above changes):
```patch
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -357,6 +357,11 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
;
+static cl::opt<bool> ForceReplicateOverWiden(
+ "force-replicate-over-widen", cl::init(false), cl::Hidden,
+ cl::desc("Force replicate recipies over widen recipes."
+ "This flag should only be used for testing."));
+
namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
"enable-vplan-native-path", cl::Hidden,
@@ -9148,8 +9153,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
continue;
}
- VPRecipeBase *Recipe =
- RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
+ VPRecipeBase *Recipe = nullptr;
+ if (!ForceReplicateOverWiden || isa<PHINode>(Instr))
+ Recipe =
+ RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
if (!Recipe)
Recipe = RecipeBuilder.handleReplication(Instr, Range);
```
With this patch applied and the `-force-vector-width=4 -force-replicate-over-widen` options passed on the command line, the following IR:
```llvm
define void @src(i1 %arg, i64 %arg1, ptr %arg2) {
bb:
br label %bb3
bb3: ; preds = %bb6, %bb
%phi = phi i64 [ %add, %bb6 ], [ 0, %bb ]
br i1 %arg, label %bb6, label %bb4
bb4: ; preds = %bb3
%load = load i32, ptr %arg2, align 8
%add5 = add i32 %load, 1
br label %bb6
bb6: ; preds = %bb4, %bb3
%phi7 = phi i32 [ %add5, %bb4 ], [ 0, %bb3 ]
%add = add i64 %phi, 1
%icmp = icmp samesign ult i64 %phi, %arg1
br i1 %icmp, label %bb3, label %bb8
bb8: ; preds = %bb6
%phi9 = phi i32 [ %phi7, %bb6 ]
ret void
}
```
gets optimized to:
```llvm
define void @src(i1 %arg, i64 %arg1, ptr %arg2) {
bb:
%0 = add i64 %arg1, 1
%min.iters.check = icmp ult i64 %0, 4
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %bb
%n.mod.vf = urem i64 %0, 4
%n.vec = sub i64 %0, %n.mod.vf
%broadcast.splatinsert = insertelement <4 x i1> poison, i1 %arg, i64 0
%broadcast.splat = shufflevector <4 x i1> %broadcast.splatinsert, <4 x i1> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %pred.load.continue6, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %pred.load.continue6 ]
%1 = xor <4 x i1> %broadcast.splat, <i1 true, i1 true, i1 true, i1 true>
%2 = extractelement <4 x i1> %1, i32 0
br i1 %2, label %pred.load.if, label %pred.load.continue
pred.load.if: ; preds = %vector.body
%3 = load i32, ptr %arg2, align 8
br label %pred.load.continue
pred.load.continue: ; preds = %pred.load.if, %vector.body
%4 = phi i32 [ poison, %vector.body ], [ %3, %pred.load.if ]
%5 = extractelement <4 x i1> %1, i32 1
br i1 %5, label %pred.load.if1, label %pred.load.continue2
pred.load.if1: ; preds = %pred.load.continue
%6 = load i32, ptr %arg2, align 8
br label %pred.load.continue2
pred.load.continue2: ; preds = %pred.load.if1, %pred.load.continue
%7 = phi i32 [ poison, %pred.load.continue ], [ %6, %pred.load.if1 ]
%8 = extractelement <4 x i1> %1, i32 2
br i1 %8, label %pred.load.if3, label %pred.load.continue4
pred.load.if3: ; preds = %pred.load.continue2
%9 = load i32, ptr %arg2, align 8
br label %pred.load.continue4
pred.load.continue4: ; preds = %pred.load.if3, %pred.load.continue2
%10 = phi i32 [ poison, %pred.load.continue2 ], [ %9, %pred.load.if3 ]
%11 = extractelement <4 x i1> %1, i32 3
br i1 %11, label %pred.load.if5, label %pred.load.continue6
pred.load.if5: ; preds = %pred.load.continue4
%12 = load i32, ptr %arg2, align 8
br label %pred.load.continue6
pred.load.continue6: ; preds = %pred.load.if5, %pred.load.continue4
%13 = phi i32 [ poison, %pred.load.continue4 ], [ %12, %pred.load.if5 ]
%14 = add i32 %4, 1
%15 = add i32 %7, 1
%16 = add i32 %10, 1
%17 = add i32 %13, 1
%18 = insertelement <4 x i32> poison, i32 %14, i32 0
%19 = insertelement <4 x i32> %18, i32 %15, i32 1
%20 = insertelement <4 x i32> %19, i32 %16, i32 2
%21 = insertelement <4 x i32> %20, i32 %17, i32 3
%predphi = select <4 x i1> %broadcast.splat, <4 x i32> zeroinitializer, <4 x i32> %21
%index.next = add nuw i64 %index, 4
%22 = icmp eq i64 %index.next, %n.vec
br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0
middle.block: ; preds = %pred.load.continue6
%23 = extractelement <4 x i32> %predphi, i32 3
%cmp.n = icmp eq i64 %0, %n.vec
br i1 %cmp.n, label %bb8, label %scalar.ph
scalar.ph: ; preds = %middle.block, %bb
%bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %bb ]
br label %bb3
bb3: ; preds = %scalar.ph, %bb6
%phi = phi i64 [ %add, %bb6 ], [ %bc.resume.val, %scalar.ph ]
br i1 %arg, label %bb6, label %bb4
bb4: ; preds = %bb3
%load = load i32, ptr %arg2, align 8
%add5 = add i32 %load, 1
br label %bb6
bb6: ; preds = %bb4, %bb3
%phi7 = phi i32 [ %add5, %bb4 ], [ 0, %bb3 ]
%add = add i64 %phi, 1
%icmp = icmp samesign ult i64 %phi, %arg1
br i1 %icmp, label %bb3, label %bb8, !llvm.loop !3
bb8: ; preds = %middle.block, %bb6
%phi9 = phi i32 [ %phi7, %bb6 ], [ %23, %middle.block ]
ret void
}
!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.isvectorized", i32 1}
!2 = !{!"llvm.loop.unroll.runtime.disable"}
!3 = distinct !{!3, !2, !1}
```
Note the `pred.load.continue6` BB:
```llvm
pred.load.continue6: ; preds = %pred.load.if5, %pred.load.continue4
%13 = phi i32 [ poison, %pred.load.continue4 ], [ %12, %pred.load.if5 ]
%14 = add i32 %4, 1
%15 = add i32 %7, 1
%16 = add i32 %10, 1
%17 = add i32 %13, 1
%18 = insertelement <4 x i32> poison, i32 %14, i32 0
%19 = insertelement <4 x i32> %18, i32 %15, i32 1
%20 = insertelement <4 x i32> %19, i32 %16, i32 2
%21 = insertelement <4 x i32> %20, i32 %17, i32 3
%predphi = select <4 x i1> %broadcast.splat, <4 x i32> zeroinitializer, <4 x i32> %21
%index.next = add nuw i64 %index, 4
%22 = icmp eq i64 %index.next, %n.vec
br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0
```
All of those adds operate on a `poison` value coming from a phi, while the predicate/mask is applied AFTER the operation is done, not before.
Before 5a4c6f97997f3cdfa9d98f7f0b546e331ee9cc4a, this would happen for dereferenceable loads, leading to a load from a poison pointer, which is immediate UB. In a downstream project, I have loads that are known to be unconditionally dereferenceable in the function body (so no ctx argument is needed), so for them this transformation still applies even after the above change.
To me, it seems like the "correct" IR would be to construct a vector from `%4`, `%7`, `%10`, `%13`, then use it in the `predphi`, and only then do a vector (or scalar) add on the result.
@fhahn, do you see any obvious problems with this? I've tried looking into why VPlan generates this IR, but it's rather involved for my untrained eye.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsWt9u47bSfxrmZhBDoizZvshFnN18DfChDXIW22tKGlvsUqQOSTmbffoDkpItyXL-tD1FCxxjsZHI4XBmfsPhkBpmDN9LxBuSbgmlu4pVklBK0k9XrLWV0jclk1ywWlzlqny5ebCErgwwyLkFtQMGVvPi2wsoiWC4LBAeoGASpAKh5B41aGy0KtsCwQ2RQLKoZlySLIKyRbAKiorJPRrgElK2LLLdZrXZrHZJUe7Yptysd6tdlKfLDJMkRtwUxZIBoXeQtxYeIEfB8YBgK3yBkpeS0JUFVtiWCfECO_7ddUErS9Tihcs9NFrlAuuexW-tsVCz0gtI6JqJWhlLkntCN8DrRhnDc-EltZrvnU6OYaFkyS1X0sBOaeB2QaJPJLq9deZ5Vvob06qVpZvlgdDVAYFLGyxRAoNCgGqsH2rRWCeXrZhvKNA4qwleMIugseANgjqghmdeouxbjAJbcePNnWMvG5aep8Z9K5gGLo3VbRHkJHQdMOIBRqkszKnHg1xCsdIA21nUwHJ1wB4pQjckuQ3qkiwK_xpmiyq0XV9fAyP0XohD7f7wnND7L5pJs1O6NoTef8XCKs1_IKH3_69Uc3xfFE3TMabb8A_yP8xqGZFlBNdJuiL0LgNCt-ExjqHrM5ZZXkAhnGLJrWosSe5ypQRJPsOjxh3qR42lh6R8wjKY9F8osLCErsM84H49ixJNceohlAYu0HRsHOIMdM8KVIOahScHNZOd5YVSDRg_0cItTbrx5t92qvUPcLTaq6rcO_d66p3rlwPqX51PHQV19naBwJFdH53w2ol0_RwoqXPpnjuXzpfWOyYMetFOXT_x0tPfnThPbUOpl2fq7RzNub8br_1QyC_O-3eC7cFUqhUlKCle3FJoTbcKupU1Zzfa202yGk3DCgTnY0BWfce8AT9Llgv8-iiY_JlZfsBHZqsRzugprg-NYPJaeprrxhPR18zT-egmXq4JvVs7J93EaeK8NOq9dOjf3lecGBJ14Gj1yxe1bbkovXi_clt9fXwKxiN0_fX-ya1eIDTzDy68DbRV0nLZ4smhwo-sPo0c7Do094y3zDiOt-ENSPKpJ-v6nTioF162O43Mone40Evo-sEFKGeXX5z_y9K4506-O_j6uN1OYHtlapCtEI3VI3L_4zvw_hZf8H8gqzuyugNuGEnuHn96-FmVSJLPJwG994yZAky1dl3_Fb1PCvQMNiOUhqJMJKiYLMVRZa7kcPLeEU4W68N5eHU-FHYZH96BydJt4NchPhy8K7q4YCuSfFrC9StxI4vchuf3oYYZt0KVdPugqmvHVXDp93DYKSHUs4uOD0_nu4zfB3xTiTs35KB46ZaH0S6i8BgITZneO-V4tuzeYvfaWN290pHr5_lxGoBcg2A5CkeZ50k3e0eXkOQWSLL1Mdx4W3uyzLH3Tz0bQtOm4p7C_fWSpFs_fVkeqTMg6Sf_lm4hOjb71sAo1zBSaSBbNnlfjmVdXpA1GYjoNnjf4x94Qnsznex0B0zwvYRTfHMapH4UK_2gnpMjjuftmI1lyy7ItjyaIBkbcnWypJvvaMn0OGA5b8pkYMtuzEn04B5NxUeSE5ryom48mX8wboNwNmiFnQzqveukdEDLjZvAk0ze12OTrC-61sgQmzlDOANNfKobpNH6BdLN1YfyySL_P7TGr82a_8ASrPpr1xyhaTQFpR8_RKXmcsEtarMoKiy-nQAa4OKRX57hMRk6gsIUTDC9aKpRawhtrnUA06lxHqyBsHJRq3Jx2PnOVmM9L6GnPGDhyUybj6gGbAb0uVasLJixC9MIl0Ya1DYYwz-iwBqla7lbwnfgsctaGsWNi_x3cIZWdJl3kKpqdzuBQfcx10vSeOHnp--bE-raf6BWLoPkTPAfqOeCR2dzd_qcgcI3z4HhXhYuKi36xKYP0hNk
w4KXJX4_i9bRdMQwxvSjFhK_245wZtJJ_In9JN_fNmRnKx6D1S12wL3ymHwezEL9LPjdalbM-4MTxQ9P6MkBjsuFjpbCSSu-u9BxTB4HCI2GzUF0hqyXPZndkC7uRyNfea9Ap1z3Vc8J6l6UdHkWiwduPho08ZvkzF1cdjfyk_RDCJ5vQOlFBOM3IKSXMIzfuc4GWmR_KpYXJDt1v4VmfHGZDmQ-TzVGqJ6PnYCbzYAbT9Bdfwhdeobu-iK6yRvoLi-hO5_aXgTBq7H5U-G9INqp-y14z5fVnNBx9FGA6QThzQzC0zwzjj8EcXIGcXxpnfLdpbV92uguYJy-E-NhdhLTj4H8OsYXRDt1v4VxehGlkdDJRzFeTjCO6QzI6RTk5fQYtJzmrPHZSWl1RpJNSeLojGZ1RpOc0axfSQJDujXIAjsuy7MkwLVu3uTk5xsySs_2IpdHRO9itBkyys7CnmMUv4cRjYaMVmeLq4OzP5iHO9V3JmKvJK1nBF7gaXbp88QjirJ97lN93zs8FLjh9HS6wX-PKIf5pj86nGdv4_St5mUpcJELNTn4DLMazy92B72Fv28mNI6Ga3XE5L359gC_5LVweLRah84ccEXdLOScUaJXbeGHnZ28549_A3VPjXO6Ti06PfvlxUKjaWtcHJiYuwMKsoaRQ2ZvXgf9nuup0fl27k7hQ7dUU_U6quMkE2H_yOXVh36_96brcrbyD77p-ptddM0El-Q9t1-z6-x3Xoid3Jcml1beG5dm3Q1_l0CW3Fgu3f5BY7La-oAZFO2OGTE9DaVdPnikJZQe7bHg5tB_syy7j0RhJx2Mp6-Mb6VWQix0Ky2vcVFyw3KBjtOAQXJB6uQobS_9hVvCn5UNF_Qki-aifRbBdvvahf2FfO9jy_qfmgz-dbng_1LBt1PBf0gm-HdLBccBIfx_KwTYShl02hhgGvtKBrkPX_hcvAjelkVQqNp17LSq_ZJ8rrgIYaWvi0BC72tmvgE3wJpGcCzh9v7L5ydPdaqS4AZKJf0lqFQWctwpjV0Nzta_vL-aKHzifPZFBBVrGpS-fKBEX7KBskAXUbuCGIGs9OU6KmQTQRevoU8pniteVE48XtdYcmYR2nwBD84WpXqWxmpkNTRa_ea87gEqduh5-xIgZ8NvUj1LN0WOZ2K08lh75EucuAwfTlsZKknClSNdGwVSQWG_A9P71i8UbkAilm6f2YBRoUiiwjoYwPZ1NcHCxnIhOgwM4AH7khQ327AgqLP6FwW1L6IyiLUBwb91GwalhdLaV8pQeHjqyjVCUVSO6PUsVFepBN2XBm_WPoz2wfIYEY9hz1YooTV-Xt67W3eOyCL_wdqTlApYz5rQtdIQclZnCLcOlfTGX4w2-2XkK-J8tVmp4EW1Tjdg8gVUfuCqNX0xmYHn_lM5Se77ii-rnfcKpb45h-HSKniuXsCXZ8AepfNlNMH4D0--Gq0rzdLMVqiBy4MSB_9RDuoXaKXVjEssAV9wcVXeJOUm2bArvIlXSbSJ0iyOr6obytYFK-gqStdpFq93GS2XKZabLNmlMdsVV_yGRnQZx1EWp1GWRotynbP1bsXWeZatk3xJlhHWjIuFjwNK76-4MS3exHEaZ5srHzhMVzRYc1OouuGiqy5wMYdQesxptOkqCvWN43adt3tDlpHgxpoTf8ut8GWI3jhub023o3Iukn6CR2VRunALXhwHeKWez4vluAGhnn0xnFXw8HTVanFTWdsYl6DQe0Lv99xWbb4oVD2oLnPideuS0Hs_hyH0vtP6cEP_EwAA__-3KNMn">