[llvm] [LoadStoreVectorizer] Allow vectorization of partially overlapped vector-stores (PR #169946)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 9 10:57:47 PST 2025
================
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+; Three pairs of stores, each pair writing the same 4 bytes (at %ptr, %ptr+16
+; and %ptr+32) using a mix of <1 x i32> and i32 operands. The CHECK lines
+; expect each pair to become a single <1 x i32> store in which the value of
+; the second store in program order is inserted last at lane 0.
+define void @onevec(ptr %ptr, <1 x i32> %sd0, i32 %sd1, i32 %sd2, <1 x i32> %sd3, <1 x i32> %sd4, <1 x i32> %sd5) {
+; CHECK-LABEL: define void @onevec(
+; CHECK-SAME: ptr [[PTR:%.*]], <1 x i32> [[SD0:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], <1 x i32> [[SD3:%.*]], <1 x i32> [[SD4:%.*]], <1 x i32> [[SD5:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[SD0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i32> [[TMP2]], i32 [[SD1]], i32 0
+; CHECK-NEXT: store <1 x i32> [[TMP3]], ptr [[PTR]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <1 x i32> poison, i32 [[SD2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i32> [[SD3]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <1 x i32> [[TMP4]], i32 [[TMP5]], i32 0
+; CHECK-NEXT: store <1 x i32> [[TMP6]], ptr [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 32
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i32> [[SD4]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <1 x i32> [[SD5]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <1 x i32> [[TMP8]], i32 [[TMP9]], i32 0
+; CHECK-NEXT: store <1 x i32> [[TMP10]], ptr [[GEP2]], align 4
+; CHECK-NEXT: ret void
+;
+ ; Pair 1: <1 x i32> store followed by an i32 store to the same address.
+ store <1 x i32> %sd0, ptr %ptr, align 4
+ store i32 %sd1, ptr %ptr, align 4
+
+ ; Pair 2: i32 store followed by a <1 x i32> store, both at byte offset 16.
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 16
+ store i32 %sd2, ptr %gep1, align 4
+ store <1 x i32> %sd3, ptr %gep1, align 4
+
+ ; Pair 3: two <1 x i32> stores, both at byte offset 32.
+ %gep2 = getelementptr inbounds i8, ptr %ptr, i32 32
+ store <1 x i32> %sd4, ptr %gep2, align 4
+ store <1 x i32> %sd5, ptr %gep2, align 4
+ ret void
+}
+
+define void @test(ptr %ptr, i32 %sd0, <2 x i32> %sd1, <2 x i32> %sd2, i32 %sd3) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[SD0:%.*]], <2 x i32> [[SD1:%.*]], <2 x i32> [[SD2:%.*]], i32 [[SD3:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SD0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SD1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[SD1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[SD2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[SD2]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[SD3]], i32 2
+; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ store i32 %sd0, ptr %ptr, align 4
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
+ store <2 x i32> %sd1, ptr %gep1, align 4
+ %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
+ store <2 x i32> %sd2, ptr %gep2, align 4
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 8
+ store i32 %sd3, ptr %gep3, align 4
+ ret void
----------------
dakersnar wrote:
Also just want to confirm, does AMD allow misaligned stores?
https://github.com/llvm/llvm-project/pull/169946
More information about the llvm-commits mailing list