<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/62960">62960</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[SVE] Sink Extends when they feed masked.gather/masked.scatter intrinsics
</td>
</tr>
<tr>
<th>Labels</th>
<td>
backend:AArch64
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
paulwalker-arm
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
mcinally
</td>
</tr>
</table>
<pre>
Masked gather/scatter intrinsics could generate better code if the extend of the index vector were sunk into the gather/scatter's block.
For example:
```
; llc test.llvm -O3 -mcpu=neoverse-v1 -aarch64-sve-vector-bits-min=256 -aarch64-sve-vector-bits-max=256 -o test.s
target triple = "aarch64-unknown-linux-gnu"
define void @vector_(ptr %a, ptr %b, ptr %c, ptr %j) {
L.entry:
%0 = load i32, ptr %j, align 4
%1 = insertelement <8 x i32> poison, i32 %0, i64 0
%2 = shufflevector <8 x i32> %1, <8 x i32> poison, <8 x i32> zeroinitializer
%3 = mul <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%4 = sext i32 %0 to i64
%5 = shl nsw i64 %4, 5
%6 = getelementptr i8, ptr %b, i64 -4
%7 = shl nsw i64 %4, 2
%8 = getelementptr i8, ptr %a, i64 2
%9 = sext <8 x i32> %3 to <8 x i64>
br label %L.LB1_509
L.LB1_509: ; preds = %L.LB1_509, %L.entry
%x = phi ptr [ null, %L.entry ], [ %21, %L.LB1_509 ]
%y = phi i32 [ 9, %L.entry ], [ %22, %L.LB1_509 ]
%z = phi i32 [ 0, %L.entry ], [ %20, %L.LB1_509 ]
%10 = zext i32 %z to i64
%11 = shl nuw nsw i64 %10, 2
%12 = getelementptr i8, ptr %c, i64 %11
%13 = load <8 x float>, ptr %12, align 4
%14 = ptrtoint ptr %x to i64
%15 = getelementptr i8, ptr %8, i64 %14
%16 = getelementptr float, ptr %15, <8 x i64> %9
%17 = tail call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %16, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
%18 = fadd fast <8 x float> %17, %13
%19 = getelementptr i8, ptr %a, i64 %11
store <8 x float> %18, ptr %19, align 1
%20 = add nuw nsw i32 %z, 8
%21 = getelementptr i8, ptr %x, i64 %5
%22 = add nsw i32 %y, -8
%23 = icmp ugt i32 %y, 8
br i1 %23, label %L.LB1_509, label %L.LB1_515
L.LB1_515: ; preds = %L.LB1_509
ret void
}
declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x float>)
```
This currently generates:
```
mul z1.s, p0/m, z1.s, z0.s
sunpklo z0.d, z1.s
ext z1.b, z1.b, z1.b, #16
sunpklo z1.d, z1.s
.LBB0_1:
ld1w { z2.s }, p0/z, [x2]
ld1w { z3.d }, p1/z, [x9, z0.d, lsl #2]
ld1w { z4.d }, p1/z, [x9, z1.d, lsl #2]
sub w10, w10, #8
add x9, x9, x8
add x2, x2, #32
cmp w10, #8
uzp1 z3.s, z3.s, z3.s
uzp1 z4.s, z4.s, z4.s
splice z3.s, p2, z3.s, z4.s
fadd z2.s, z3.s, z2.s
st1w { z2.s }, p0, [x0]
add x0, x0, #32
b.hi .LBB0_1
```
The extend used for the pointer arithmetic creates a double-wide index vector before the loop, so the gather is split into two halves whose results later need to be unzipped and spliced back into a single 8 x 32-bit vector inside the loop.
But if we sink that extend into the loop, then ISel will pick up the index vector as <8 x i32> and do the right thing. E.g.:
```
mul z0.s, p0/m, z0.s, z1.s
.LBB0_1:
ld1w { z1.s }, p0/z, [x2]
ld1w { z2.s }, p0/z, [x9, z0.s, sxtw #2]
add x9, x9, x8
sub w10, w10, #8
add x2, x2, #32
cmp w10, #8
fadd z1.s, z2.s, z1.s
st1w { z1.s }, p0, [x0]
add x0, x0, #32
b.hi .LBB0_1
```
I prototyped this with the following patch, but it is obviously a big hammer: it sinks *all* extends whose result type has to be split (TypeSplitVector), not just those that feed masked gather/scatter intrinsics. I'm not sure where this selective sinking should be done, so I'll turn it over to the experts at ARM for a real solution.
```
index dd431cc6f4f5..bfe1b9b60c1b 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8058,9 +8058,12 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
/// Sink a zext or sext into its user blocks if the target type doesn't
/// fit in one register
- if (TLI->getTypeAction(CI->getContext(),
- TLI->getValueType(*DL, CI->getType())) ==
- TargetLowering::TypeExpandInteger) {
+ EVT VT = TLI->getValueType(*DL, CI->getType());
+ TargetLowering::LegalizeTypeAction Action =
+ TLI->getTypeAction(CI->getContext(), VT);
+
+ if (Action == TargetLowering::TypeExpandInteger ||
+ Action == TargetLowering::TypeSplitVector) {
return SinkCast(CI);
} else {
if (TLI->optimizeExtendOrTruncateConversion(
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy8WMtu6zjSfhpmU7AgUZZjLbKIneRHgDT6x5zgLGbToKSyxQ5NCiQVX55-QFKyaTuX0weDMQyLFqs-VhW_Kl6YMXwtEe9IsSCUdqwXWybeUE-Y3hBKSfFww3rbKn23qblkQuxvKtXs7_5g5g0bWDPboib0ydTMWtTApdVcGl4bqFUvGlijRM0sQoVeoFYNAl-BbRG4bHAH71hbpQm9NWD3HQI3YHr55qCUF7scxItWQtVvCUkfSHoffp-UBtyxTSeQ5PdxD5mlwzf8zRcgRA0WjU2EeN_A5M8cJpu660n-IFG9ozY4ec9gwpiu29l0Yt5xEuycVNyayYZLkj_QYvaFCNuNIioMZcLwluk1WrCadwKB5A9AKB1Revkm1VZOBJf9brKWvZuEyJUGV1wivCveAJmmYcC_CJ13VgOhBSN0CUO7itp11P6b0BLI7SIgviQord4fQwZOJPV2CcUa4Dk9110CE3wtYRrJZ16eS4PaosANSgskX85h5_XzR-gUN0o6bZ5TP4Rvz6YwzIp7Rz2MafvVSmDw7QLGjeUUPwM_f39ArbjkljPBD6gjg3M_0qYX1_h0wHF2pqPB2digYyMfG9OxUYyN2di4JfljNOg0uIc7ewwCWOWCEAkVQwwESLP1AXKaDrGIpGZeao1jtN308Pnl5DvtSQx--yk4jaTm34GzETzWKk_eXcU0d36Ob2fTKCyVBsEqFE7qJXlZZH8VaRkz_vQyvweXup3Gxgx5E6m4SXP_A5tPZu28aNfyYHyxANkLcS4OpHjwb4qFZ0B27B7QvcAJc3_E9PNYLKD8GpB-A3i4Aky_Bky_AcxCBh8irh2uuZZlJz7025gTWXpJiox-x4p6ZIVHjlXzUzkZSLASillHg5N6Rj-pLSFtOqut4tKO4rsP3Cm-M3Eem3im-lFCBSMjC4u4xHgae-LHOCHDLOMCaibEpb-uaLslJ9n4BTQJa1vyPl_lNHmfdymh80Gls3qseLOzWjMakPnufMkzsLpHL_Nfa4apubS-lw2uCD3zOFSLFWsaWDFjrz12URnomuWxZvnLdeaMUcYqjR8OE-tm5YlOMRlpyAxn7pHzQ4I4hXksmn1n4C4yMC7PlJ4GOQ2wd9KTsxFCZvB600G_tmeC86hE8ixIu_cflMsP3mbFB0U0K74posOIGq3fYgwQtw_nW5BasI8m4B8ye-Q032yYXl8S-wP2HXl3uZ_zv68tN1D3WqO0Yn_cd5pv9oJpuekFSctDlhg_sSmhTxvXGt8c0nHnRtLS9LJ7E8oppElzFBu7cWcDVjV0nT0JzbPZR1DZJVTyslikf2WR7aVosi1JS3K7gANNDLhZGe09DEvDjh5XgQuNPGmOGlmsUQ4uegOEcRTKP0WZfo2SfY5i-oqk5TYsK8OD0Hx-FGBNQ9IyAA2_V51-gdgNa2meHzfGZb3pTuhnsP2hy1yE82Eu4-elzHToi59H-zvBa4yQOnqGF8uugrluls5kaIxnP53NENE0jt7gv-_cpdf-V0nLSVqOrPkqSdBtlqU7iTHNbbtBy2vAnUXZQK3RZQwwaFRfCYQtb3A4n8FK6ejQxqX_0yJr3IFsCduW1y0I5pAluuJiFVQIvTzwrsMGmGwgRLFxXRprd87yKOfbxWG8YQShVJdA7MOit-4AuUUwXL6BbZkdHTgeGZ2Ws8q2KOH5BwrYciGg4_Ub9N3V4ROYuTDCWdsELM3XrQXbcrlO4DFZJ79aUdKrijK8-eU8z_5xnn9aGcrYALOz26sU_TIDfzV_fyNFx3TJojS5rKtxumT_u3R5hk4rq-zeEdi6BWbLbetZsVJCqC2Xa-iYrVs3SNVbUNU7V70Re-AWuEuliq-hZZsNasfXGocOx12nTeg9c8eR-4HDxlNPKgt_98bRThkMHF8hNrD59t4lgWdCbzcewvQa
YduixmC9QYG15e94HN60_pamQmiU9BtAozyAEGB7LZ21IU9DMuCuQ20NMAv3__rDFwUGGpkAo0RvuZLJF7kRcq5ppnlW17PVdFUkSbXCrCqrWVpnFWRpOpsO-_LJZAKM0Ce3qXAPXhH6tFQN_h_KU-v_NXZMY1J33TAkXYQvVL-hPE3JNIXJPPVHhRIIXQxtd_4JvZVSAs4RXALn96qzfMMP-CyNJXTuHrqvXVDcND-76P6hGr7aP7wCoTPf5tg8vJ5dyLjiRuicG0by5b8fd9bDue3P_DkILsntEkL_j6v-c6zwIfQpfOGHK5ksnAuVHu4iXNXk1kBvUIeLNTNe0Y2XVfvOMQSNJPTWfoK9csyWoCSCxjU3drxxmQTJ4Nfry_OE5I9rtK_7Du99eAidL8e3SyWt20bRuXdmOUKc9H4y0aNT9jL3Dy8usMsYdVQOwcgf3HeE8Q69qC1qLtdh3pzK465jsnmWFtcusaIYOir5z-PPV_j56jfNv2dMHiF-ZMcLrv1F1SkwMDyODnjVfxhB-Pl6MfqFY2FioqGch78Sp4GLF3Du82tgPzrB7c_h_veatv4g4oqQY-2S-aRaPse-uIUAUBj8QPeMbmNqPvoi-6d-1b2smcWlku-oTYjgRc26ae7ypsxLdoN32WxeZPmc3k5v2rtiTlfzaVEUWV5ndd1ks6ZJy2m2ymk5zcv0ht_RlOZpQWc0K26LaVLP6HTOcFrM05KWKZJpihvGhb9-TpRe33Bjeryb0XKW3viznBnu5CtWv6FsSH5_f-_viIdreX3ndCdVvzbu5MWNNSc0y63wl_o_fj6S4iGk_eOwwGzdpsi2uI-XlOS4pAz_r1eWm16Lu9bazp-pfM6vuW37KqnVJqq1zqxOq7-xtoQ-eccMoU_et_8EAAD__0x8wlw">