<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/142699">142699</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[opt] Loop unrolling goes out of bounds for nvptx64-nvidia-cuda
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
clementval
</td>
</tr>
</table>
<pre>
Consider the following input LLVM IR:
```
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Reproducer kernel. Takes two pointers (%1 is only loaded from, %0 is only
; stored to) and an i32 count (%2). It sums floats read from %1 in a strided
; loop, combines the sum through five __pgi_shfl_xorf2 calls, and stores the
; result through %0.
define ptx_kernel void @_QMmPpartialsumshflshflr4(ptr %0, ptr %1, i32 %2) {
; Stack slots: %5 = running float sum, %6 = value returned by each
; __pgi_shfl_xorf2 call, %7 = loop stride, %8 = starting index,
; %9 = current loop index, %10 = copy of %2. %4 is never accessed.
%4 = alloca i32, i64 1, align 4
%5 = alloca float, i64 1, align 4
%6 = alloca float, i64 1, align 4
%7 = alloca i32, i64 1, align 4
%8 = alloca i32, i64 1, align 4
%9 = alloca i32, i64 1, align 4
%10 = alloca i32, i64 1, align 4
; %12 = tid.x + 1, %14 = ctaid.x + 1, %15 = nctaid.x, %16 = ntid.x.
%11 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%12 = add i32 %11, 1
%13 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%14 = add i32 %13, 1
%15 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
%16 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
store i32 %2, ptr %10, align 4
; %20 = max(sext(%2), 0); its only use is %46 in the loop body, which is itself unused.
%17 = load i32, ptr %10, align 4
%18 = sext i32 %17 to i64
%19 = icmp sgt i64 %18, 0
%20 = select i1 %19, i64 %18, i64 0
; Stride = nctaid.x * ntid.x.
%21 = mul i32 %15, %16
store i32 %21, ptr %7, align 4
; Running sum starts at 0.0.
store float 0.000000e+00, ptr %5, align 4
; Starting index = ctaid.x * ntid.x + tid.x + 1.
%22 = mul i32 %13, %16
%23 = add i32 %22, %12
store i32 %23, ptr %8, align 4
; Trip count %33 = (count - start + stride) sdiv stride, computed in i64.
%24 = load i32, ptr %8, align 4
%25 = sext i32 %24 to i64
%26 = load i32, ptr %10, align 4
%27 = sext i32 %26 to i64
%28 = load i32, ptr %7, align 4
%29 = sext i32 %28 to i64
%30 = trunc i64 %25 to i32
%31 = sub i64 %27, %25
%32 = add i64 %31, %29
%33 = sdiv i64 %32, %29
br label %34
; Loop header: %35 is the current index, %36 the remaining trip count.
; The body runs while %36 is greater than zero.
34: ; preds = %38, %3
%35 = phi i32 [ %52, %38 ], [ %30, %3 ]
%36 = phi i64 [ %53, %38 ], [ %33, %3 ]
%37 = icmp sgt i64 %36, 0
br i1 %37, label %38, label %54
; Loop body: load the float at element (index - 1) of %1, add it to the
; running sum, advance the index by the stride and decrement the count.
38: ; preds = %34
store i32 %35, ptr %9, align 4
%39 = load float, ptr %5, align 4
%40 = load i32, ptr %9, align 4
%41 = sext i32 %40 to i64
%42 = sub nsw i64 %41, 1
%43 = mul nsw i64 %42, 1
%44 = mul nsw i64 %43, 1
%45 = add nsw i64 %44, 0
%46 = mul nsw i64 1, %20
%47 = getelementptr float, ptr %1, i64 %45
%48 = load float, ptr %47, align 4
%49 = fadd contract float %39, %48
store float %49, ptr %5, align 4
%50 = trunc i64 %29 to i32
%51 = load i32, ptr %9, align 4
%52 = add nsw i32 %51, %50
%53 = sub i64 %36, 1
br label %34
; Loop exit: five rounds, each passing the running sum to __pgi_shfl_xorf2
; with second argument 1, 2, 4, 8, 16 and adding the returned value back in.
; NOTE(review): presumably an XOR-shuffle reduction across threads;
; __pgi_shfl_xorf2 is only declared in this module, so confirm its semantics.
54: ; preds = %34
store i32 %35, ptr %9, align 4
%55 = load float, ptr %5, align 4
%56 = call contract float @__pgi_shfl_xorf2(float %55, i32 1)
store float %56, ptr %6, align 4
%57 = load float, ptr %5, align 4
%58 = load float, ptr %6, align 4
%59 = fadd contract float %57, %58
store float %59, ptr %5, align 4
%60 = load float, ptr %5, align 4
%61 = call contract float @__pgi_shfl_xorf2(float %60, i32 2)
store float %61, ptr %6, align 4
%62 = load float, ptr %5, align 4
%63 = load float, ptr %6, align 4
%64 = fadd contract float %62, %63
store float %64, ptr %5, align 4
%65 = load float, ptr %5, align 4
%66 = call contract float @__pgi_shfl_xorf2(float %65, i32 4)
store float %66, ptr %6, align 4
%67 = load float, ptr %5, align 4
%68 = load float, ptr %6, align 4
%69 = fadd contract float %67, %68
store float %69, ptr %5, align 4
%70 = load float, ptr %5, align 4
%71 = call contract float @__pgi_shfl_xorf2(float %70, i32 8)
store float %71, ptr %6, align 4
%72 = load float, ptr %5, align 4
%73 = load float, ptr %6, align 4
%74 = fadd contract float %72, %73
store float %74, ptr %5, align 4
%75 = load float, ptr %5, align 4
%76 = call contract float @__pgi_shfl_xorf2(float %75, i32 16)
store float %76, ptr %6, align 4
%77 = load float, ptr %5, align 4
%78 = load float, ptr %6, align 4
%79 = fadd contract float %77, %78
store float %79, ptr %5, align 4
; Store the final sum to element ((ctaid.x + 1) - 1) of %0.
%80 = load float, ptr %5, align 4
%81 = sext i32 %14 to i64
%82 = sub nsw i64 %81, 1
%83 = mul nsw i64 %82, 1
%84 = mul nsw i64 %83, 1
%85 = add nsw i64 %84, 0
%86 = getelementptr float, ptr %0, i64 %85
store float %80, ptr %86, align 4
ret void
}
declare float @__pgi_shfl_xorf2(float, i32)
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"Debug Info Version", i32 3}
```
Invoking `opt` for `nvptx64-nvidia-cuda` at `-O1` or higher unrolls the loop in `_QMmPpartialsumshflshflr4`:
```
opt -mtriple=nvptx64-nvidia-cuda -O1 -S testcase.ll -o -
```
The problem is that the resulting unrolled code will try to access data out of bounds:
```
; ModuleID = 'dummy.1.ll'
source_filename = "LLVMDialectModule"
target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; The same kernel as printed by opt. The summation loop now appears as a
; remainder loop (.lr.ph.prol, one load per iteration) followed by a loop
; body (.lr.ph) that performs four loads per iteration.
define ptx_kernel void @_QMmPpartialsumshflshflr4(ptr writeonly captures(none) %0, ptr readonly captures(none) %1, i32 %2) local_unnamed_addr {
; %12 = starting index (ctaid.x * ntid.x + tid.x + 1), %10 = stride
; (nctaid.x * ntid.x), %17 = trip count ((stride - start + count) sdiv stride).
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%5 = add nuw nsw i32 %4, 1
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%9 = sext i32 %2 to i64
%10 = mul i32 %7, %8
%11 = mul i32 %6, %8
%12 = add i32 %5, %11
%13 = sext i32 %12 to i64
%14 = sext i32 %10 to i64
%15 = sub nsw i64 %14, %13
%16 = add nsw i64 %15, %9
%17 = sdiv i64 %16, %14
; Base pointer %1 moved back by 4 bytes; every load below indexes from it.
; This matches the input's "index - 1" applied to float elements.
%invariant.gep = getelementptr i8, ptr %1, i64 -4
; Skip all loads when the trip count is not positive.
%18 = icmp sgt i64 %17, 0
br i1 %18, label %.lr.ph.preheader, label %._crit_edge
; %xtraiter = trip count modulo 4 (low two bits of %17); when it is zero the
; remainder loop is skipped.
.lr.ph.preheader: ; preds = %3
%xtraiter = and i64 %17, 3
%lcmp.mod.not = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod.not, label %.lr.ph.prol.loopexit, label %.lr.ph.prol
; Remainder loop: one load per iteration, repeated until %prol.iter.next
; equals %xtraiter. %19 = remaining count, %20 = current index,
; %.023.prol = running sum.
.lr.ph.prol: ; preds = %.lr.ph.preheader, %.lr.ph.prol
%19 = phi i64 [ %25, %.lr.ph.prol ], [ %17, %.lr.ph.preheader ]
%20 = phi i32 [ %24, %.lr.ph.prol ], [ %12, %.lr.ph.preheader ]
%.023.prol = phi float [ %23, %.lr.ph.prol ], [ 0.000000e+00, %.lr.ph.preheader ]
%prol.iter = phi i64 [ %prol.iter.next, %.lr.ph.prol ], [ 0, %.lr.ph.preheader ]
%21 = sext i32 %20 to i64
%gep.prol = getelementptr float, ptr %invariant.gep, i64 %21
%22 = load float, ptr %gep.prol, align 4
%23 = fadd contract float %.023.prol, %22
%24 = add nsw i32 %20, %10
%25 = add nsw i64 %19, -1
%prol.iter.next = add i64 %prol.iter, 1
%prol.iter.cmp.not = icmp eq i64 %prol.iter.next, %xtraiter
br i1 %prol.iter.cmp.not, label %.lr.ph.prol.loopexit, label %.lr.ph.prol, !llvm.loop !1
; Merge point after the remainder loop. The four-load loop is entered only
; when the original trip count %17 is at least 4 (unsigned compare).
.lr.ph.prol.loopexit: ; preds = %.lr.ph.prol, %.lr.ph.preheader
%.unr = phi i64 [ %17, %.lr.ph.preheader ], [ %25, %.lr.ph.prol ]
%.unr24 = phi i32 [ %12, %.lr.ph.preheader ], [ %24, %.lr.ph.prol ]
%.023.unr = phi float [ 0.000000e+00, %.lr.ph.preheader ], [ %23, %.lr.ph.prol ]
%.lcssa.unr = phi float [ poison, %.lr.ph.preheader ], [ %23, %.lr.ph.prol ]
%26 = icmp ult i64 %17, 4
br i1 %26, label %._crit_edge, label %.lr.ph.preheader.new
; Precompute 2x, 3x and 4x the stride (i32 adds without nsw/nuw flags).
.lr.ph.preheader.new: ; preds = %.lr.ph.prol.loopexit
%invariant.op = add i32 %10, %10
%invariant.op29 = add i32 %invariant.op, %10
%invariant.op31 = add i32 %invariant.op29, %10
br label %.lr.ph
; Four-load loop body: loads at index, index + stride, index + 2*stride and
; index + 3*stride, then advances the index by 4*stride and the remaining
; count %27 by -4. It repeats while %27 is greater than 4.
.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader.new
%27 = phi i64 [ %.unr, %.lr.ph.preheader.new ], [ %42, %.lr.ph ]
%28 = phi i32 [ %.unr24, %.lr.ph.preheader.new ], [ %.reass32, %.lr.ph ]
%.023 = phi float [ %.023.unr, %.lr.ph.preheader.new ], [ %41, %.lr.ph ]
%29 = sext i32 %28 to i64
%gep = getelementptr float, ptr %invariant.gep, i64 %29
%30 = load float, ptr %gep, align 4
%31 = fadd contract float %.023, %30
%32 = add nsw i32 %28, %10
%33 = sext i32 %32 to i64
%gep.1 = getelementptr float, ptr %invariant.gep, i64 %33
%34 = load float, ptr %gep.1, align 4
%35 = fadd contract float %31, %34
%.reass = add i32 %28, %invariant.op
%36 = sext i32 %.reass to i64
%gep.2 = getelementptr float, ptr %invariant.gep, i64 %36
%37 = load float, ptr %gep.2, align 4
%38 = fadd contract float %35, %37
%.reass30 = add i32 %28, %invariant.op29
%39 = sext i32 %.reass30 to i64
%gep.3 = getelementptr float, ptr %invariant.gep, i64 %39
%40 = load float, ptr %gep.3, align 4
%41 = fadd contract float %38, %40
%.reass32 = add i32 %28, %invariant.op31
%42 = add nsw i64 %27, -4
%43 = icmp sgt i64 %27, 4
br i1 %43, label %.lr.ph, label %._crit_edge
; Exit: select the final sum, combine it through five __pgi_shfl_xorf2 calls
; (second argument 1, 2, 4, 8, 16) and store it to element ctaid.x of %0.
._crit_edge: ; preds = %.lr.ph.prol.loopexit, %.lr.ph, %3
%.0.lcssa = phi float [ 0.000000e+00, %3 ], [ %.lcssa.unr, %.lr.ph.prol.loopexit ], [ %41, %.lr.ph ]
%44 = tail call contract float @__pgi_shfl_xorf2(float %.0.lcssa, i32 1)
%45 = fadd contract float %.0.lcssa, %44
%46 = tail call contract float @__pgi_shfl_xorf2(float %45, i32 2)
%47 = fadd contract float %45, %46
%48 = tail call contract float @__pgi_shfl_xorf2(float %47, i32 4)
%49 = fadd contract float %47, %48
%50 = tail call contract float @__pgi_shfl_xorf2(float %49, i32 8)
%51 = fadd contract float %49, %50
%52 = tail call contract float @__pgi_shfl_xorf2(float %51, i32 16)
%53 = fadd contract float %51, %52
%54 = zext nneg i32 %6 to i64
%55 = getelementptr float, ptr %0, i64 %54
store float %53, ptr %55, align 4
ret void
}
declare float @__pgi_shfl_xorf2(float, i32) local_unnamed_addr
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !{!1, !2}
!2 = !{!"llvm.loop.unroll.disable"}
```
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzUW1lv4zgS_jXKC2GBIqnDD3lIJgjQwAxmL8xrIEu0zR2a0lKUO9lfv6Ak6qAo2UpPo3uDOeyoVBe_KhZZlbSq2ElQ-uiFz1748pDW6lzIx4zTCxXqmvKHQ5F_PL4WEqgzqwATZa3Ar7_-8Rv48g8fePDJi2D3D3zy8DP4rchrTr-8AA-_AA_FmvaFpZxmqn3kodiDT1VRy4y-HRmnIr3Qjho5qJFmDJ9yemSCglK9v_1JpaAcXAuWA4_At7__dvlbmUrFUl7Vl-p85PpfSTyUlEoCD4XQQ7-A7nOgPzOM9GfkoT3w4mcPPgH9nTR6pJwXWappGtKIgOadlLOTAMTQhmPaIy9StUYdbaKON-iRbKDdb6AN4BbioCHOUs5b3xLI-fXii-v14kua5n6p3v1K0pOvWO6_eyjx0L5_G7Wi8twsTNAICXoCfDf7TKUOAWQmAE8FhHcLEG4J0f0MbA9UqpB0BMkBqtDl6hYcvEhzsyqr5C0-KvquettjoAq9lD1NiwuWXUpQnVSzys2rmh80VAh2nHR0Aha0bxpU9PT6y_BOi4tLzXvhoSZqPOYwPhiZE9vWtKRN8ADow-aHeugZjsM7dPgAoZkWeKqFJsI2RhAyRMihKh4JTVxCydI6OanD2TIhYi0TijatPIrnLKMRS_09WeI4c76m3s8ZJpaOuAWJkrXIDC5Q2BBh1BO1qKjqQ08Sd65GYU80SgotEQ4MUR92uF20KmfXngpNqQ4S8PRAefOMtJsJJh5-Anq7KiXNq273CXHSvYp7_u2ylGfWWhw-NxgzInACvPCl-dI-wdA8aR4YLtHARSvZccGLXLCTS-yMUhwNUXqQXVzixp-D3cnka2jckLjdQOZwx-EIHHsHOPB-gFK_v63EJIFL0HNxJ8EMegROsUxQDypRfTXeIdONhOA-E4yJ0JSIOImmOwYJe3iOicgkZZJoxqnH8EDUruuJKtqWXdoPtguDUZ4lfYyQZNnpxA5h_bt2kY5a66wQSqaZ6hJqs4SdbiSZ5dvm5RtLGrpif2_FfhhsWfcQTb3cLn1onBj2TgyxnVLauAgWc0C4kAO2gl8LD7dhPxwVC_YyEPj2Vp7Ymy5h394LeUQeSvo1aNlptYJp9TBQRCPhkUPXeKOuKxCz2Tf0qxALTZ4PnRALb0Esgtu0j4LPejqCxtNowdNRsO7pCG3UFW_zdETWPB2ZPSrCTuXJLW02Yjr6NKajHtNkydM3MB1txHS0EdPRKqYjg-nIienoFqbjjZiOP43puMd0suDp-Aam442YjjdiOl7FdGwwHTsxHd_CdLwR0_GnMR0PeTpacvUNUMcbQR1vBHW8CurYgDp2gjq-BepkI6iTeYEX2OefxFnhJdMKL3FWeMm0wkucFV4yrfASZ4WXTCu8JLqjeIOj4i0JXf5MxkfYZLZckqrmuktXLvGLuRDLeDrwWMZjh8QWht0t3WstMsUKAZ6UkpUuhEShYX5Isz-BKI6SUiCK6kNkQBS1-MpEDqqSZjVPVXrgFHxlnEuqainAhV4K-eGhRBSCtlKMbvrdnB6BTMWJeijRK9tYGkCkk_2GuyLgIQx_HgNQQGKS4EhX2TfNmNwY_TyGNIGz6y1J7rBE_MymBBCF99jgRlWqlGSHWtGq-WV7Hoif_0IbQB-7Hgoa7S7NHbd_5OnJHEACL372UADHtHD0rKtIf2m-IvRCD_UJfBHHAvxBZcUK4bXXVpoOd0zGV_Tw6Yu4Fn8ycQJFqcCxkMCLoLiW6j0iO3FlOUt3WZ2nXgSBTiwR_D3QnwsJzux0prIxEdRCFpwDdaaAF0UJmNCky3fxEZy1C7T83UVJVnLq4ReHDgDsfg_A7p9A0UplaUV9zgHYFWA3N-tfZwpKWRw4vQBWAXVOVduzaNRV8kPvJmmW0aoCVXGhIE9VCopageIIDhpT
ld95_EZLI68vlw8_8Dn_TC9DpfJEVSOdpx9afkdPd2Xk4SeMmv_s9MaHnyKyYwFKPPwUoGR3DTRFEO2uhkoE_Tu6pB8JaP1qmLsW-NtbK18lU7QQ_ANkaalqSasx2kd9Fx2BK3SzngzXQfdWC-3P_C3Nczlv06iU8c92HEZbe_11fLVAJiVAtE2QszMQb-Phbi8kG5k4THZc3tptAGhfk5siMLH6PCOSaEYya-b0N_52P2dS7c20IXMaaNOEroowIEYgttozVi3X9yL2VnNlfJ8cGAuDXiwT11SyVCj_REtH_ccS183dzu7JzPotseMmN5he3fpc-uXZLyU90zSncvrwLZNMvdH8RNvInlE7b7yMWu9KpkxR2bpK5FO9cHcu4dml1BuXLwo1mEH_Y6gNF4ct41fdVhXc17sJfWeLBJZlBXca5fLTjM-4-WVdzqNw_oZ1Sx_ENkkna3xp3zXNrAYCIre5ozu4-xDh7uVOSHca6MTgdTHzJtoNec0C9RCxXNY_9AV9Vzck3-O6-ZEQ2QngRMvB_vVD2CRoRwcyFJhm2MrdhpHjOLl2fcOFc3S_QubaH1n9QetqGxnPBNBqDdq5qzmB74zyU-fbnbP-6WR3G97RcbkQzs5V7WN8Et4zfp-P8ba-bba3prr0ULN5TAN_YIOfwNrPcnbo12WWL0yE1cIJ9vXgH8J4MY-M-HdIsHLEegIYSVjKJeMcMbZiSBH3x_9I3FJOMeJ4VlXpgsCyYJU-ofwVgrpueAPXmlubKJkiE0WL--Ta5uoL-tW9kTZPvgF1A3RnJUVRzmZVpjnBou668iP68dNZNhk_7PrwC6-i_fTlcROtNWXsm5VdeGG9O--OZxWsINMoWnnZAgyZBswEK4krwtrYu1-ALrKrCi-L0aHm3olNEG6wJli25o45DHdheveeuLfmOZY2Rcd-2IFqZT_sDMPQMeox3gqTGXjx_NyA7XOD3qqDb7AdD8MfZL0gsAfhhnmRpQa7WVPca9tiajZ5ZEyfhHInIpr5oGMy9wP6Fj9E1uDJoh-QCwXJqiPMrojjqSM6uN1yRYvPYepk7gzsqhTxt7hj75pccbkDO9xBVoOinzsi0HLH7CztdAcOpjMwVqnYDlcNx89uCMY-fiLXztlOvdh5f_3QOfrF6tHMKgTt_QIPabUtKu6tYLCdt_uSZF5R9Brcn36Jff20rTVorPHsQY5hqGgxdw5vtrNG0zGjz2pE-mYlGusSr-lCTASTyBpG-rQWsWeNAdwcWCKmCCeJPYP0WSX2nt0hH2aWlrQwdVJoyrNuaumzSgzXoUPzeBhxWpqs6WeihlGrFqj_1clRCHrqr-ys1Njdht7fwwzn87j9PKPp8dpN3r-ohem4GV5rQF3qSpWyOElaVf__Lc4faM137Hf-GKu-f_PzB9r1fTqh38mgH9YWRV03I2eVYiJTg5gulwaop0RTPTyE-sspv22F-jmrtKla0LTt-pA_4nyP9-kDfQxikuz3YQjjh_NjdkwSGGZJeggTkqYoQjDMsiyjMTrGKMwf2COCKIQRJBDCEMZ-SPJsj9CRUBwcDyn1CKSXlHG_0aaQpwdWVTV9DAiK9vuHpkirmj_uQkifMZunWsPw5UE-6pd2h_pUaYywSlUDG8UUb_4qrCiVF7507V4mTuBUNL3jUd-0aSI7GowPteSPZ6VKHQ4eevXQ64mpc33ws-LioVctrPvfrpTFv2mmPPTaqFh56LWz4fqI_hcAAP___kFf0Q">