[llvm] 0dc0aeb - [LV] Add additional tests for replicating calls returning structs.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 22 05:49:14 PDT 2025
Author: Florian Hahn
Date: 2025-06-22T13:48:25+01:00
New Revision: 0dc0aeb14f1e38b84f4abca9d170e971e28d2ec3
URL: https://github.com/llvm/llvm-project/commit/0dc0aeb14f1e38b84f4abca9d170e971e28d2ec3
DIFF: https://github.com/llvm/llvm-project/commit/0dc0aeb14f1e38b84f4abca9d170e971e28d2ec3.diff
LOG: [LV] Add additional tests for replicating calls returning structs.
Add additional test coverage for replicating calls return structs, in
particular cases where the number of struct elements does not match the
VF.
Extra test coverage for
https://github.com/llvm/llvm-project/pull/142433.
Added:
llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
Modified:
llvm/test/Transforms/LoopVectorize/struct-return.ll
Removed:
################################################################################
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
new file mode 100644
index 0000000000000..fe53334cb25a7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
@@ -0,0 +1,484 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "middle.block:" --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefix=VF4 %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefix=VF2IC2 %s
+
+define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonly %out_a) {
+; VF4-LABEL: define void @struct_return_1xi64_replicate(
+; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*:]]
+; VF4-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
+; VF4-NEXT: [[TMP3:%.*]] = tail call { i64 } @fn1(float [[TMP2]]) #[[ATTR0:[0-9]+]]
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
+; VF4-NEXT: [[TMP5:%.*]] = tail call { i64 } @fn1(float [[TMP4]]) #[[ATTR0]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
+; VF4-NEXT: [[TMP7:%.*]] = tail call { i64 } @fn1(float [[TMP6]]) #[[ATTR0]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
+; VF4-NEXT: [[TMP9:%.*]] = tail call { i64 } @fn1(float [[TMP8]]) #[[ATTR0]]
+; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i64 } [[TMP3]], 0
+; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0
+; VF4-NEXT: [[TMP12:%.*]] = insertvalue { <4 x i64> } poison, <4 x i64> [[TMP11]], 0
+; VF4-NEXT: [[TMP13:%.*]] = extractvalue { i64 } [[TMP5]], 0
+; VF4-NEXT: [[TMP14:%.*]] = extractvalue { <4 x i64> } [[TMP12]], 0
+; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP13]], i32 1
+; VF4-NEXT: [[TMP16:%.*]] = insertvalue { <4 x i64> } [[TMP12]], <4 x i64> [[TMP15]], 0
+; VF4-NEXT: [[TMP17:%.*]] = extractvalue { i64 } [[TMP7]], 0
+; VF4-NEXT: [[TMP18:%.*]] = extractvalue { <4 x i64> } [[TMP16]], 0
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP17]], i32 2
+; VF4-NEXT: [[TMP20:%.*]] = insertvalue { <4 x i64> } [[TMP16]], <4 x i64> [[TMP19]], 0
+; VF4-NEXT: [[TMP21:%.*]] = extractvalue { i64 } [[TMP9]], 0
+; VF4-NEXT: [[TMP22:%.*]] = extractvalue { <4 x i64> } [[TMP20]], 0
+; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP21]], i32 3
+; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x i64> } [[TMP20]], <4 x i64> [[TMP23]], 0
+; VF4-NEXT: [[TMP25:%.*]] = extractvalue { <4 x i64> } [[TMP24]], 0
+; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[TMP26]], i32 0
+; VF4-NEXT: store <4 x i64> [[TMP25]], ptr [[TMP27]], align 4
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_1xi64_replicate(
+; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*:]]
+; VF2IC2-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF2IC2: [[VECTOR_PH]]:
+; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC2: [[VECTOR_BODY]]:
+; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i64 } @fn1(float [[TMP3]]) #[[ATTR0:[0-9]+]]
+; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i64 } @fn1(float [[TMP5]]) #[[ATTR0]]
+; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { i64 } [[TMP4]], 0
+; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
+; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP8]], 0
+; VF2IC2-NEXT: [[TMP10:%.*]] = extractvalue { i64 } [[TMP6]], 0
+; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x i64> } [[TMP9]], 0
+; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1
+; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x i64> } [[TMP9]], <2 x i64> [[TMP12]], 0
+; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT: [[TMP15:%.*]] = tail call { i64 } @fn1(float [[TMP14]]) #[[ATTR0]]
+; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT: [[TMP17:%.*]] = tail call { i64 } @fn1(float [[TMP16]]) #[[ATTR0]]
+; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { i64 } [[TMP15]], 0
+; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0
+; VF2IC2-NEXT: [[TMP20:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP19]], 0
+; VF2IC2-NEXT: [[TMP21:%.*]] = extractvalue { i64 } [[TMP17]], 0
+; VF2IC2-NEXT: [[TMP22:%.*]] = extractvalue { <2 x i64> } [[TMP20]], 0
+; VF2IC2-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i32 1
+; VF2IC2-NEXT: [[TMP24:%.*]] = insertvalue { <2 x i64> } [[TMP20]], <2 x i64> [[TMP23]], 0
+; VF2IC2-NEXT: [[TMP25:%.*]] = extractvalue { <2 x i64> } [[TMP13]], 0
+; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { <2 x i64> } [[TMP24]], 0
+; VF2IC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 0
+; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 2
+; VF2IC2-NEXT: store <2 x i64> [[TMP25]], ptr [[TMP28]], align 4
+; VF2IC2-NEXT: store <2 x i64> [[TMP26]], ptr [[TMP29]], align 4
+; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2IC2: [[MIDDLE_BLOCK]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ %call = tail call { i64 } @fn1(float %in_val) #0
+ %extract_a = extractvalue { i64 } %call, 0
+ %arrayidx2 = getelementptr inbounds i64, ptr %out_a, i64 %iv
+ store i64 %extract_a, ptr %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; VF4-LABEL: define void @struct_return_2xf32_replicate(
+; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*:]]
+; VF4-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
+; VF4-NEXT: [[TMP3:%.*]] = tail call { float, float } @fn2(float [[TMP2]]) #[[ATTR1:[0-9]+]]
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
+; VF4-NEXT: [[TMP5:%.*]] = tail call { float, float } @fn2(float [[TMP4]]) #[[ATTR1]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
+; VF4-NEXT: [[TMP7:%.*]] = tail call { float, float } @fn2(float [[TMP6]]) #[[ATTR1]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
+; VF4-NEXT: [[TMP9:%.*]] = tail call { float, float } @fn2(float [[TMP8]]) #[[ATTR1]]
+; VF4-NEXT: [[TMP10:%.*]] = extractvalue { float, float } [[TMP3]], 0
+; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0
+; VF4-NEXT: [[TMP12:%.*]] = insertvalue { <4 x float>, <4 x float> } poison, <4 x float> [[TMP11]], 0
+; VF4-NEXT: [[TMP13:%.*]] = extractvalue { float, float } [[TMP3]], 1
+; VF4-NEXT: [[TMP14:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP12]], 1
+; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP13]], i32 0
+; VF4-NEXT: [[TMP16:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP12]], <4 x float> [[TMP15]], 1
+; VF4-NEXT: [[TMP17:%.*]] = extractvalue { float, float } [[TMP5]], 0
+; VF4-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP16]], 0
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP17]], i32 1
+; VF4-NEXT: [[TMP20:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP16]], <4 x float> [[TMP19]], 0
+; VF4-NEXT: [[TMP21:%.*]] = extractvalue { float, float } [[TMP5]], 1
+; VF4-NEXT: [[TMP22:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP20]], 1
+; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP21]], i32 1
+; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP20]], <4 x float> [[TMP23]], 1
+; VF4-NEXT: [[TMP25:%.*]] = extractvalue { float, float } [[TMP7]], 0
+; VF4-NEXT: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP24]], 0
+; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP25]], i32 2
+; VF4-NEXT: [[TMP28:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP24]], <4 x float> [[TMP27]], 0
+; VF4-NEXT: [[TMP29:%.*]] = extractvalue { float, float } [[TMP7]], 1
+; VF4-NEXT: [[TMP30:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP28]], 1
+; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP29]], i32 2
+; VF4-NEXT: [[TMP32:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP28]], <4 x float> [[TMP31]], 1
+; VF4-NEXT: [[TMP33:%.*]] = extractvalue { float, float } [[TMP9]], 0
+; VF4-NEXT: [[TMP34:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP32]], 0
+; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP33]], i32 3
+; VF4-NEXT: [[TMP36:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP32]], <4 x float> [[TMP35]], 0
+; VF4-NEXT: [[TMP37:%.*]] = extractvalue { float, float } [[TMP9]], 1
+; VF4-NEXT: [[TMP38:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP36]], 1
+; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP37]], i32 3
+; VF4-NEXT: [[TMP40:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP36]], <4 x float> [[TMP39]], 1
+; VF4-NEXT: [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 0
+; VF4-NEXT: [[TMP42:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 1
+; VF4-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i32 0
+; VF4-NEXT: store <4 x float> [[TMP41]], ptr [[TMP44]], align 4
+; VF4-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0
+; VF4-NEXT: store <4 x float> [[TMP42]], ptr [[TMP46]], align 4
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_2xf32_replicate(
+; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*:]]
+; VF2IC2-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF2IC2: [[VECTOR_PH]]:
+; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC2: [[VECTOR_BODY]]:
+; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR1:[0-9]+]]
+; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR1]]
+; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { float, float } [[TMP4]], 0
+; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
+; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP8]], 0
+; VF2IC2-NEXT: [[TMP10:%.*]] = extractvalue { float, float } [[TMP4]], 1
+; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP9]], 1
+; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP10]], i32 0
+; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP9]], <2 x float> [[TMP12]], 1
+; VF2IC2-NEXT: [[TMP14:%.*]] = extractvalue { float, float } [[TMP6]], 0
+; VF2IC2-NEXT: [[TMP15:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP13]], 0
+; VF2IC2-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i32 1
+; VF2IC2-NEXT: [[TMP17:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP13]], <2 x float> [[TMP16]], 0
+; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { float, float } [[TMP6]], 1
+; VF2IC2-NEXT: [[TMP19:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP17]], 1
+; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i32 1
+; VF2IC2-NEXT: [[TMP21:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP17]], <2 x float> [[TMP20]], 1
+; VF2IC2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT: [[TMP23:%.*]] = tail call { float, float } @fn2(float [[TMP22]]) #[[ATTR1]]
+; VF2IC2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT: [[TMP25:%.*]] = tail call { float, float } @fn2(float [[TMP24]]) #[[ATTR1]]
+; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { float, float } [[TMP23]], 0
+; VF2IC2-NEXT: [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i32 0
+; VF2IC2-NEXT: [[TMP28:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP27]], 0
+; VF2IC2-NEXT: [[TMP29:%.*]] = extractvalue { float, float } [[TMP23]], 1
+; VF2IC2-NEXT: [[TMP30:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP28]], 1
+; VF2IC2-NEXT: [[TMP31:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i32 0
+; VF2IC2-NEXT: [[TMP32:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP28]], <2 x float> [[TMP31]], 1
+; VF2IC2-NEXT: [[TMP33:%.*]] = extractvalue { float, float } [[TMP25]], 0
+; VF2IC2-NEXT: [[TMP34:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP32]], 0
+; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x float> [[TMP34]], float [[TMP33]], i32 1
+; VF2IC2-NEXT: [[TMP36:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP32]], <2 x float> [[TMP35]], 0
+; VF2IC2-NEXT: [[TMP37:%.*]] = extractvalue { float, float } [[TMP25]], 1
+; VF2IC2-NEXT: [[TMP38:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP36]], 1
+; VF2IC2-NEXT: [[TMP39:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i32 1
+; VF2IC2-NEXT: [[TMP40:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP36]], <2 x float> [[TMP39]], 1
+; VF2IC2-NEXT: [[TMP41:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 0
+; VF2IC2-NEXT: [[TMP42:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 0
+; VF2IC2-NEXT: [[TMP43:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 1
+; VF2IC2-NEXT: [[TMP44:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 1
+; VF2IC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0
+; VF2IC2-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 2
+; VF2IC2-NEXT: store <2 x float> [[TMP41]], ptr [[TMP46]], align 4
+; VF2IC2-NEXT: store <2 x float> [[TMP42]], ptr [[TMP47]], align 4
+; VF2IC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
+; VF2IC2-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 2
+; VF2IC2-NEXT: store <2 x float> [[TMP43]], ptr [[TMP49]], align 4
+; VF2IC2-NEXT: store <2 x float> [[TMP44]], ptr [[TMP50]], align 4
+; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2IC2: [[MIDDLE_BLOCK]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ %call = tail call { float, float } @fn2(float %in_val) #1
+ %extract_a = extractvalue { float, float } %call, 0
+ %extract_b = extractvalue { float, float } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+ store float %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+
+define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonly %dst.a, ptr noalias %dst.b, ptr noalias %dst.c) {
+; VF4-LABEL: define void @struct_return_3xi32_replicate(
+; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[DST_A:%.*]], ptr noalias [[DST_B:%.*]], ptr noalias [[DST_C:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*:]]
+; VF4-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; VF4-NEXT: [[TMP3:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP2]]) #[[ATTR2:[0-9]+]]
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; VF4-NEXT: [[TMP5:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP4]]) #[[ATTR2]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; VF4-NEXT: [[TMP7:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP6]]) #[[ATTR2]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; VF4-NEXT: [[TMP9:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP8]]) #[[ATTR2]]
+; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 0
+; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0
+; VF4-NEXT: [[TMP12:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP11]], 0
+; VF4-NEXT: [[TMP13:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 1
+; VF4-NEXT: [[TMP14:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 1
+; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP13]], i32 0
+; VF4-NEXT: [[TMP16:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], <4 x i32> [[TMP15]], 1
+; VF4-NEXT: [[TMP17:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 2
+; VF4-NEXT: [[TMP18:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], 2
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP17]], i32 0
+; VF4-NEXT: [[TMP20:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], <4 x i32> [[TMP19]], 2
+; VF4-NEXT: [[TMP21:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 0
+; VF4-NEXT: [[TMP22:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], 0
+; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP21]], i32 1
+; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], <4 x i32> [[TMP23]], 0
+; VF4-NEXT: [[TMP25:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 1
+; VF4-NEXT: [[TMP26:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP24]], 1
+; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP25]], i32 1
+; VF4-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP24]], <4 x i32> [[TMP27]], 1
+; VF4-NEXT: [[TMP29:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 2
+; VF4-NEXT: [[TMP30:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], 2
+; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP29]], i32 1
+; VF4-NEXT: [[TMP32:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], <4 x i32> [[TMP31]], 2
+; VF4-NEXT: [[TMP33:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 0
+; VF4-NEXT: [[TMP34:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], 0
+; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP33]], i32 2
+; VF4-NEXT: [[TMP36:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], <4 x i32> [[TMP35]], 0
+; VF4-NEXT: [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 1
+; VF4-NEXT: [[TMP38:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], 1
+; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP37]], i32 2
+; VF4-NEXT: [[TMP40:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], <4 x i32> [[TMP39]], 1
+; VF4-NEXT: [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 2
+; VF4-NEXT: [[TMP42:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP40]], 2
+; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP41]], i32 2
+; VF4-NEXT: [[TMP44:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP40]], <4 x i32> [[TMP43]], 2
+; VF4-NEXT: [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 0
+; VF4-NEXT: [[TMP46:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], 0
+; VF4-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP45]], i32 3
+; VF4-NEXT: [[TMP48:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], <4 x i32> [[TMP47]], 0
+; VF4-NEXT: [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 1
+; VF4-NEXT: [[TMP50:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], 1
+; VF4-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[TMP49]], i32 3
+; VF4-NEXT: [[TMP52:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], <4 x i32> [[TMP51]], 1
+; VF4-NEXT: [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 2
+; VF4-NEXT: [[TMP54:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], 2
+; VF4-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP53]], i32 3
+; VF4-NEXT: [[TMP56:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], <4 x i32> [[TMP55]], 2
+; VF4-NEXT: [[TMP57:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 0
+; VF4-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 0
+; VF4-NEXT: store <4 x i32> [[TMP57]], ptr [[TMP59]], align 4
+; VF4-NEXT: [[TMP60:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 1
+; VF4-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP61]], i32 0
+; VF4-NEXT: store <4 x i32> [[TMP60]], ptr [[TMP62]], align 4
+; VF4-NEXT: [[TMP63:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 2
+; VF4-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]]
+; VF4-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
+; VF4-NEXT: store <4 x i32> [[TMP63]], ptr [[TMP65]], align 4
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT: br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_3xi32_replicate(
+; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[DST_A:%.*]], ptr noalias [[DST_B:%.*]], ptr noalias [[DST_C:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*:]]
+; VF2IC2-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF2IC2: [[VECTOR_PH]]:
+; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC2: [[VECTOR_BODY]]:
+; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
+; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP3]]) #[[ATTR2:[0-9]+]]
+; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP5]]) #[[ATTR2]]
+; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 0
+; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP8]], 0
+; VF2IC2-NEXT: [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 1
+; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], 1
+; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP10]], i32 0
+; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], <2 x i32> [[TMP12]], 1
+; VF2IC2-NEXT: [[TMP14:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 2
+; VF2IC2-NEXT: [[TMP15:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], 2
+; VF2IC2-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 0
+; VF2IC2-NEXT: [[TMP17:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], <2 x i32> [[TMP16]], 2
+; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 0
+; VF2IC2-NEXT: [[TMP19:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], 0
+; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i32 1
+; VF2IC2-NEXT: [[TMP21:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], <2 x i32> [[TMP20]], 0
+; VF2IC2-NEXT: [[TMP22:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 1
+; VF2IC2-NEXT: [[TMP23:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], 1
+; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP22]], i32 1
+; VF2IC2-NEXT: [[TMP25:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], <2 x i32> [[TMP24]], 1
+; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 2
+; VF2IC2-NEXT: [[TMP27:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], 2
+; VF2IC2-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[TMP26]], i32 1
+; VF2IC2-NEXT: [[TMP29:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], <2 x i32> [[TMP28]], 2
+; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT: [[TMP31:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP30]]) #[[ATTR2]]
+; VF2IC2-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT: [[TMP33:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP32]]) #[[ATTR2]]
+; VF2IC2-NEXT: [[TMP34:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 0
+; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i32 0
+; VF2IC2-NEXT: [[TMP36:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP35]], 0
+; VF2IC2-NEXT: [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 1
+; VF2IC2-NEXT: [[TMP38:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], 1
+; VF2IC2-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP37]], i32 0
+; VF2IC2-NEXT: [[TMP40:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], <2 x i32> [[TMP39]], 1
+; VF2IC2-NEXT: [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 2
+; VF2IC2-NEXT: [[TMP42:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], 2
+; VF2IC2-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[TMP41]], i32 0
+; VF2IC2-NEXT: [[TMP44:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], <2 x i32> [[TMP43]], 2
+; VF2IC2-NEXT: [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 0
+; VF2IC2-NEXT: [[TMP46:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], 0
+; VF2IC2-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> [[TMP46]], i32 [[TMP45]], i32 1
+; VF2IC2-NEXT: [[TMP48:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], <2 x i32> [[TMP47]], 0
+; VF2IC2-NEXT: [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 1
+; VF2IC2-NEXT: [[TMP50:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], 1
+; VF2IC2-NEXT: [[TMP51:%.*]] = insertelement <2 x i32> [[TMP50]], i32 [[TMP49]], i32 1
+; VF2IC2-NEXT: [[TMP52:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], <2 x i32> [[TMP51]], 1
+; VF2IC2-NEXT: [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 2
+; VF2IC2-NEXT: [[TMP54:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], 2
+; VF2IC2-NEXT: [[TMP55:%.*]] = insertelement <2 x i32> [[TMP54]], i32 [[TMP53]], i32 1
+; VF2IC2-NEXT: [[TMP56:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], <2 x i32> [[TMP55]], 2
+; VF2IC2-NEXT: [[TMP57:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 0
+; VF2IC2-NEXT: [[TMP58:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 0
+; VF2IC2-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 0
+; VF2IC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 2
+; VF2IC2-NEXT: store <2 x i32> [[TMP57]], ptr [[TMP60]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP58]], ptr [[TMP61]], align 4
+; VF2IC2-NEXT: [[TMP62:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 1
+; VF2IC2-NEXT: [[TMP63:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 1
+; VF2IC2-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
+; VF2IC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 2
+; VF2IC2-NEXT: store <2 x i32> [[TMP62]], ptr [[TMP65]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP63]], ptr [[TMP66]], align 4
+; VF2IC2-NEXT: [[TMP67:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 2
+; VF2IC2-NEXT: [[TMP68:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 2
+; VF2IC2-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 0
+; VF2IC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 2
+; VF2IC2-NEXT: store <2 x i32> [[TMP67]], ptr [[TMP70]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP68]], ptr [[TMP71]], align 4
+; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2IC2: [[MIDDLE_BLOCK]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i32, ptr %in, i64 %iv
+ %in_val = load i32, ptr %arrayidx, align 4
+ %call = tail call { i32, i32, i32 } @fn3(i32 %in_val) #2
+ %extract_a = extractvalue { i32, i32, i32 } %call, 0
+ %gep.dst.a = getelementptr inbounds i32, ptr %dst.a, i64 %iv
+ store i32 %extract_a, ptr %gep.dst.a, align 4
+ %extract_b = extractvalue { i32, i32, i32 } %call, 1
+ %gep.dst.b = getelementptr inbounds i32, ptr %dst.b, i64 %iv
+ store i32 %extract_b, ptr %gep.dst.b, align 4
+ %extract_c = extractvalue { i32, i32, i32 } %call, 2
+ %gep.dst.c = getelementptr inbounds i32, ptr %dst.c, i64 %iv
+ store i32 %extract_c, ptr %gep.dst.c, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+declare { i64 } @fn1(float)
+declare { float, float } @fn2(float)
+declare { i32, i32, i32 } @fn3(i32)
+
+declare { <8 x i64> } @fixed_vec_fn1(<8 x float>)
+declare { <8 x float>, <8 x float> } @fixed_vec_fn2(<8 x float>)
+declare { <8 x i32>, <8 x i32>, <8 x i32> } @fixed_vec_fn3(<8 x i32>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn1(fixed_vec_fn1)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn2(fixed_vec_fn2)" }
+attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn3(fixed_vec_fn3)" }
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 50b9ba12af82d..6d849c01f4058 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -69,59 +69,6 @@ exit:
ret void
}
-; CHECK-REMARKS: remark: {{.*}} vectorized loop
-; Note: Later instcombines reduce this down quite a lot.
-define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
-; CHECK-LABEL: define void @struct_return_f32_replicate
-; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
-; CHECK: vector.body:
-; CHECK: [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}})
-; CHECK: [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}})
-; // Lane 0
-; CHECK: [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0
-; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK: [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0
-; CHECK: [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1
-; CHECK: [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1
-; CHECK: [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0
-; CHECK: [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1
-; // Lane 1
-; CHECK: [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0
-; CHECK: [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0
-; CHECK: [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1
-; CHECK: [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0
-; CHECK: [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1
-; CHECK: [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1
-; CHECK: [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i32 1
-; CHECK: [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x float> [[VEC_B]], 1
-; // Store wide values:
-; CHECK: [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0
-; CHECK: [[VEC_B_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 1
-; CHECK: store <2 x float> [[VEC_A_EXT]], ptr {{%.*}}, align 4
-; CHECK: store <2 x float> [[VEC_B_EXT]], ptr {{%.*}}, align 4
-entry:
- br label %for.body
-
-for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
- %in_val = load float, ptr %arrayidx, align 4
- ; #3 does not have a fixed-size vector mapping (so replication is used)
- %call = tail call { float, float } @foo(float %in_val) #3
- %extract_a = extractvalue { float, float } %call, 0
- %extract_b = extractvalue { float, float } %call, 1
- %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
- store float %extract_a, ptr %arrayidx2, align 4
- %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
- store float %extract_b, ptr %arrayidx4, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, 1024
- br i1 %exitcond.not, label %exit, label %for.body
-
-exit:
- ret void
-}
-
; CHECK-REMARKS: remark: {{.*}} vectorized loop
define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks
More information about the llvm-commits
mailing list