[llvm] 0dc0aeb - [LV] Add additional tests for replicating calls returning structs.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sun Jun 22 05:49:14 PDT 2025


Author: Florian Hahn
Date: 2025-06-22T13:48:25+01:00
New Revision: 0dc0aeb14f1e38b84f4abca9d170e971e28d2ec3

URL: https://github.com/llvm/llvm-project/commit/0dc0aeb14f1e38b84f4abca9d170e971e28d2ec3
DIFF: https://github.com/llvm/llvm-project/commit/0dc0aeb14f1e38b84f4abca9d170e971e28d2ec3.diff

LOG: [LV] Add additional tests for replicating calls returning structs.

Add additional test coverage for replicating calls that return structs,
in particular for cases where the number of struct elements does not
match the VF.

Extra test coverage for
https://github.com/llvm/llvm-project/pull/142433.

Added: 
    llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll

Modified: 
    llvm/test/Transforms/LoopVectorize/struct-return.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
new file mode 100644
index 0000000000000..fe53334cb25a7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
@@ -0,0 +1,484 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "middle.block:" --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefix=VF4 %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefix=VF2IC2 %s
+
+define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonly %out_a) {
+; VF4-LABEL: define void @struct_return_1xi64_replicate(
+; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
+; VF4-NEXT:    [[TMP3:%.*]] = tail call { i64 } @fn1(float [[TMP2]]) #[[ATTR0:[0-9]+]]
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = tail call { i64 } @fn1(float [[TMP4]]) #[[ATTR0]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
+; VF4-NEXT:    [[TMP7:%.*]] = tail call { i64 } @fn1(float [[TMP6]]) #[[ATTR0]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
+; VF4-NEXT:    [[TMP9:%.*]] = tail call { i64 } @fn1(float [[TMP8]]) #[[ATTR0]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractvalue { i64 } [[TMP3]], 0
+; VF4-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0
+; VF4-NEXT:    [[TMP12:%.*]] = insertvalue { <4 x i64> } poison, <4 x i64> [[TMP11]], 0
+; VF4-NEXT:    [[TMP13:%.*]] = extractvalue { i64 } [[TMP5]], 0
+; VF4-NEXT:    [[TMP14:%.*]] = extractvalue { <4 x i64> } [[TMP12]], 0
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP13]], i32 1
+; VF4-NEXT:    [[TMP16:%.*]] = insertvalue { <4 x i64> } [[TMP12]], <4 x i64> [[TMP15]], 0
+; VF4-NEXT:    [[TMP17:%.*]] = extractvalue { i64 } [[TMP7]], 0
+; VF4-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x i64> } [[TMP16]], 0
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP17]], i32 2
+; VF4-NEXT:    [[TMP20:%.*]] = insertvalue { <4 x i64> } [[TMP16]], <4 x i64> [[TMP19]], 0
+; VF4-NEXT:    [[TMP21:%.*]] = extractvalue { i64 } [[TMP9]], 0
+; VF4-NEXT:    [[TMP22:%.*]] = extractvalue { <4 x i64> } [[TMP20]], 0
+; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP21]], i32 3
+; VF4-NEXT:    [[TMP24:%.*]] = insertvalue { <4 x i64> } [[TMP20]], <4 x i64> [[TMP23]], 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractvalue { <4 x i64> } [[TMP24]], 0
+; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[TMP26]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP25]], ptr [[TMP27]], align 4
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_1xi64_replicate(
+; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) {
+; VF2IC2-NEXT:  [[ENTRY:.*:]]
+; VF2IC2-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF2IC2:       [[VECTOR_PH]]:
+; VF2IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC2:       [[VECTOR_BODY]]:
+; VF2IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF2IC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; VF2IC2-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT:    [[TMP4:%.*]] = tail call { i64 } @fn1(float [[TMP3]]) #[[ATTR0:[0-9]+]]
+; VF2IC2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT:    [[TMP6:%.*]] = tail call { i64 } @fn1(float [[TMP5]]) #[[ATTR0]]
+; VF2IC2-NEXT:    [[TMP7:%.*]] = extractvalue { i64 } [[TMP4]], 0
+; VF2IC2-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
+; VF2IC2-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP8]], 0
+; VF2IC2-NEXT:    [[TMP10:%.*]] = extractvalue { i64 } [[TMP6]], 0
+; VF2IC2-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64> } [[TMP9]], 0
+; VF2IC2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1
+; VF2IC2-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x i64> } [[TMP9]], <2 x i64> [[TMP12]], 0
+; VF2IC2-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT:    [[TMP15:%.*]] = tail call { i64 } @fn1(float [[TMP14]]) #[[ATTR0]]
+; VF2IC2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT:    [[TMP17:%.*]] = tail call { i64 } @fn1(float [[TMP16]]) #[[ATTR0]]
+; VF2IC2-NEXT:    [[TMP18:%.*]] = extractvalue { i64 } [[TMP15]], 0
+; VF2IC2-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0
+; VF2IC2-NEXT:    [[TMP20:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP19]], 0
+; VF2IC2-NEXT:    [[TMP21:%.*]] = extractvalue { i64 } [[TMP17]], 0
+; VF2IC2-NEXT:    [[TMP22:%.*]] = extractvalue { <2 x i64> } [[TMP20]], 0
+; VF2IC2-NEXT:    [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i32 1
+; VF2IC2-NEXT:    [[TMP24:%.*]] = insertvalue { <2 x i64> } [[TMP20]], <2 x i64> [[TMP23]], 0
+; VF2IC2-NEXT:    [[TMP25:%.*]] = extractvalue { <2 x i64> } [[TMP13]], 0
+; VF2IC2-NEXT:    [[TMP26:%.*]] = extractvalue { <2 x i64> } [[TMP24]], 0
+; VF2IC2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 0
+; VF2IC2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 2
+; VF2IC2-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP28]], align 4
+; VF2IC2-NEXT:    store <2 x i64> [[TMP26]], ptr [[TMP29]], align 4
+; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT:    br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2IC2:       [[MIDDLE_BLOCK]]:
+;
+entry: ; Scalar input loop under test: out_a[iv] = field 0 of @fn1(in[iv]).
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; iv starts at 0 and is advanced by %iv.next below.
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { i64 } @fn1(float %in_val) #0 ; Call returning a one-element struct; the assertions above expect one scalar call per vector lane.
+  %extract_a = extractvalue { i64 } %call, 0 ; The struct has a single i64 field, which is the value stored.
+  %arrayidx2 = getelementptr inbounds i64, ptr %out_a, i64 %iv
+  store i64 %extract_a, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024 ; Leave the loop once iv.next reaches 1024, i.e. after 1024 iterations.
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; VF4-LABEL: define void @struct_return_2xf32_replicate(
+; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
+; VF4-NEXT:    [[TMP3:%.*]] = tail call { float, float } @fn2(float [[TMP2]]) #[[ATTR1:[0-9]+]]
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = tail call { float, float } @fn2(float [[TMP4]]) #[[ATTR1]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
+; VF4-NEXT:    [[TMP7:%.*]] = tail call { float, float } @fn2(float [[TMP6]]) #[[ATTR1]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
+; VF4-NEXT:    [[TMP9:%.*]] = tail call { float, float } @fn2(float [[TMP8]]) #[[ATTR1]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractvalue { float, float } [[TMP3]], 0
+; VF4-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0
+; VF4-NEXT:    [[TMP12:%.*]] = insertvalue { <4 x float>, <4 x float> } poison, <4 x float> [[TMP11]], 0
+; VF4-NEXT:    [[TMP13:%.*]] = extractvalue { float, float } [[TMP3]], 1
+; VF4-NEXT:    [[TMP14:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP12]], 1
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP13]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP12]], <4 x float> [[TMP15]], 1
+; VF4-NEXT:    [[TMP17:%.*]] = extractvalue { float, float } [[TMP5]], 0
+; VF4-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP16]], 0
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP17]], i32 1
+; VF4-NEXT:    [[TMP20:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP16]], <4 x float> [[TMP19]], 0
+; VF4-NEXT:    [[TMP21:%.*]] = extractvalue { float, float } [[TMP5]], 1
+; VF4-NEXT:    [[TMP22:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP20]], 1
+; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP24:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP20]], <4 x float> [[TMP23]], 1
+; VF4-NEXT:    [[TMP25:%.*]] = extractvalue { float, float } [[TMP7]], 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP24]], 0
+; VF4-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP25]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP24]], <4 x float> [[TMP27]], 0
+; VF4-NEXT:    [[TMP29:%.*]] = extractvalue { float, float } [[TMP7]], 1
+; VF4-NEXT:    [[TMP30:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP28]], 1
+; VF4-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP29]], i32 2
+; VF4-NEXT:    [[TMP32:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP28]], <4 x float> [[TMP31]], 1
+; VF4-NEXT:    [[TMP33:%.*]] = extractvalue { float, float } [[TMP9]], 0
+; VF4-NEXT:    [[TMP34:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP32]], 0
+; VF4-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP33]], i32 3
+; VF4-NEXT:    [[TMP36:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP32]], <4 x float> [[TMP35]], 0
+; VF4-NEXT:    [[TMP37:%.*]] = extractvalue { float, float } [[TMP9]], 1
+; VF4-NEXT:    [[TMP38:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP36]], 1
+; VF4-NEXT:    [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP37]], i32 3
+; VF4-NEXT:    [[TMP40:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP36]], <4 x float> [[TMP39]], 1
+; VF4-NEXT:    [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 0
+; VF4-NEXT:    [[TMP42:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 1
+; VF4-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i32 0
+; VF4-NEXT:    store <4 x float> [[TMP41]], ptr [[TMP44]], align 4
+; VF4-NEXT:    [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0
+; VF4-NEXT:    store <4 x float> [[TMP42]], ptr [[TMP46]], align 4
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_2xf32_replicate(
+; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; VF2IC2-NEXT:  [[ENTRY:.*:]]
+; VF2IC2-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF2IC2:       [[VECTOR_PH]]:
+; VF2IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC2:       [[VECTOR_BODY]]:
+; VF2IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF2IC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; VF2IC2-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT:    [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR1:[0-9]+]]
+; VF2IC2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT:    [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR1]]
+; VF2IC2-NEXT:    [[TMP7:%.*]] = extractvalue { float, float } [[TMP4]], 0
+; VF2IC2-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
+; VF2IC2-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP8]], 0
+; VF2IC2-NEXT:    [[TMP10:%.*]] = extractvalue { float, float } [[TMP4]], 1
+; VF2IC2-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP9]], 1
+; VF2IC2-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP10]], i32 0
+; VF2IC2-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP9]], <2 x float> [[TMP12]], 1
+; VF2IC2-NEXT:    [[TMP14:%.*]] = extractvalue { float, float } [[TMP6]], 0
+; VF2IC2-NEXT:    [[TMP15:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP13]], 0
+; VF2IC2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i32 1
+; VF2IC2-NEXT:    [[TMP17:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP13]], <2 x float> [[TMP16]], 0
+; VF2IC2-NEXT:    [[TMP18:%.*]] = extractvalue { float, float } [[TMP6]], 1
+; VF2IC2-NEXT:    [[TMP19:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP17]], 1
+; VF2IC2-NEXT:    [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i32 1
+; VF2IC2-NEXT:    [[TMP21:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP17]], <2 x float> [[TMP20]], 1
+; VF2IC2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT:    [[TMP23:%.*]] = tail call { float, float } @fn2(float [[TMP22]]) #[[ATTR1]]
+; VF2IC2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT:    [[TMP25:%.*]] = tail call { float, float } @fn2(float [[TMP24]]) #[[ATTR1]]
+; VF2IC2-NEXT:    [[TMP26:%.*]] = extractvalue { float, float } [[TMP23]], 0
+; VF2IC2-NEXT:    [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i32 0
+; VF2IC2-NEXT:    [[TMP28:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP27]], 0
+; VF2IC2-NEXT:    [[TMP29:%.*]] = extractvalue { float, float } [[TMP23]], 1
+; VF2IC2-NEXT:    [[TMP30:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP28]], 1
+; VF2IC2-NEXT:    [[TMP31:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i32 0
+; VF2IC2-NEXT:    [[TMP32:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP28]], <2 x float> [[TMP31]], 1
+; VF2IC2-NEXT:    [[TMP33:%.*]] = extractvalue { float, float } [[TMP25]], 0
+; VF2IC2-NEXT:    [[TMP34:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP32]], 0
+; VF2IC2-NEXT:    [[TMP35:%.*]] = insertelement <2 x float> [[TMP34]], float [[TMP33]], i32 1
+; VF2IC2-NEXT:    [[TMP36:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP32]], <2 x float> [[TMP35]], 0
+; VF2IC2-NEXT:    [[TMP37:%.*]] = extractvalue { float, float } [[TMP25]], 1
+; VF2IC2-NEXT:    [[TMP38:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP36]], 1
+; VF2IC2-NEXT:    [[TMP39:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i32 1
+; VF2IC2-NEXT:    [[TMP40:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP36]], <2 x float> [[TMP39]], 1
+; VF2IC2-NEXT:    [[TMP41:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 0
+; VF2IC2-NEXT:    [[TMP42:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 0
+; VF2IC2-NEXT:    [[TMP43:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 1
+; VF2IC2-NEXT:    [[TMP44:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 1
+; VF2IC2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0
+; VF2IC2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 2
+; VF2IC2-NEXT:    store <2 x float> [[TMP41]], ptr [[TMP46]], align 4
+; VF2IC2-NEXT:    store <2 x float> [[TMP42]], ptr [[TMP47]], align 4
+; VF2IC2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
+; VF2IC2-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 2
+; VF2IC2-NEXT:    store <2 x float> [[TMP43]], ptr [[TMP49]], align 4
+; VF2IC2-NEXT:    store <2 x float> [[TMP44]], ptr [[TMP50]], align 4
+; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT:    br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2IC2:       [[MIDDLE_BLOCK]]:
+;
+entry: ; Scalar input loop under test: out_a[iv] and out_b[iv] receive fields 0 and 1 of @fn2(in[iv]).
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; iv starts at 0 and is advanced by %iv.next below.
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @fn2(float %in_val) #1 ; Call returning a two-element struct; the assertions above expect one scalar call per vector lane.
+  %extract_a = extractvalue { float, float } %call, 0 ; Field 0 is stored to %out_a.
+  %extract_b = extractvalue { float, float } %call, 1 ; Field 1 is stored to %out_b.
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024 ; Leave the loop once iv.next reaches 1024, i.e. after 1024 iterations.
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
+define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonly %dst.a, ptr noalias %dst.b, ptr noalias %dst.c) {
+; VF4-LABEL: define void @struct_return_3xi32_replicate(
+; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[DST_A:%.*]], ptr noalias [[DST_B:%.*]], ptr noalias [[DST_C:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; VF4-NEXT:    [[TMP3:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP2]]) #[[ATTR2:[0-9]+]]
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP4]]) #[[ATTR2]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; VF4-NEXT:    [[TMP7:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP6]]) #[[ATTR2]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; VF4-NEXT:    [[TMP9:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP8]]) #[[ATTR2]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 0
+; VF4-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0
+; VF4-NEXT:    [[TMP12:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP11]], 0
+; VF4-NEXT:    [[TMP13:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 1
+; VF4-NEXT:    [[TMP14:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 1
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP13]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], <4 x i32> [[TMP15]], 1
+; VF4-NEXT:    [[TMP17:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 2
+; VF4-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], 2
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP17]], i32 0
+; VF4-NEXT:    [[TMP20:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], <4 x i32> [[TMP19]], 2
+; VF4-NEXT:    [[TMP21:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 0
+; VF4-NEXT:    [[TMP22:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], 0
+; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP24:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], <4 x i32> [[TMP23]], 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 1
+; VF4-NEXT:    [[TMP26:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP24]], 1
+; VF4-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP25]], i32 1
+; VF4-NEXT:    [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP24]], <4 x i32> [[TMP27]], 1
+; VF4-NEXT:    [[TMP29:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 2
+; VF4-NEXT:    [[TMP30:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], 2
+; VF4-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP29]], i32 1
+; VF4-NEXT:    [[TMP32:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], <4 x i32> [[TMP31]], 2
+; VF4-NEXT:    [[TMP33:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 0
+; VF4-NEXT:    [[TMP34:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], 0
+; VF4-NEXT:    [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP33]], i32 2
+; VF4-NEXT:    [[TMP36:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], <4 x i32> [[TMP35]], 0
+; VF4-NEXT:    [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 1
+; VF4-NEXT:    [[TMP38:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], 1
+; VF4-NEXT:    [[TMP39:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP37]], i32 2
+; VF4-NEXT:    [[TMP40:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], <4 x i32> [[TMP39]], 1
+; VF4-NEXT:    [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 2
+; VF4-NEXT:    [[TMP42:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP40]], 2
+; VF4-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP41]], i32 2
+; VF4-NEXT:    [[TMP44:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP40]], <4 x i32> [[TMP43]], 2
+; VF4-NEXT:    [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 0
+; VF4-NEXT:    [[TMP46:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], 0
+; VF4-NEXT:    [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP45]], i32 3
+; VF4-NEXT:    [[TMP48:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], <4 x i32> [[TMP47]], 0
+; VF4-NEXT:    [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 1
+; VF4-NEXT:    [[TMP50:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], 1
+; VF4-NEXT:    [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[TMP49]], i32 3
+; VF4-NEXT:    [[TMP52:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], <4 x i32> [[TMP51]], 1
+; VF4-NEXT:    [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 2
+; VF4-NEXT:    [[TMP54:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], 2
+; VF4-NEXT:    [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP53]], i32 3
+; VF4-NEXT:    [[TMP56:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], <4 x i32> [[TMP55]], 2
+; VF4-NEXT:    [[TMP57:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 0
+; VF4-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 0
+; VF4-NEXT:    store <4 x i32> [[TMP57]], ptr [[TMP59]], align 4
+; VF4-NEXT:    [[TMP60:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 1
+; VF4-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP61]], i32 0
+; VF4-NEXT:    store <4 x i32> [[TMP60]], ptr [[TMP62]], align 4
+; VF4-NEXT:    [[TMP63:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 2
+; VF4-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
+; VF4-NEXT:    store <4 x i32> [[TMP63]], ptr [[TMP65]], align 4
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT:    br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_3xi32_replicate(
+; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[DST_A:%.*]], ptr noalias [[DST_B:%.*]], ptr noalias [[DST_C:%.*]]) {
+; VF2IC2-NEXT:  [[ENTRY:.*:]]
+; VF2IC2-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF2IC2:       [[VECTOR_PH]]:
+; VF2IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC2:       [[VECTOR_BODY]]:
+; VF2IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
+; VF2IC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; VF2IC2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT:    [[TMP4:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP3]]) #[[ATTR2:[0-9]+]]
+; VF2IC2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT:    [[TMP6:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP5]]) #[[ATTR2]]
+; VF2IC2-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 0
+; VF2IC2-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; VF2IC2-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP8]], 0
+; VF2IC2-NEXT:    [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 1
+; VF2IC2-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], 1
+; VF2IC2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP10]], i32 0
+; VF2IC2-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], <2 x i32> [[TMP12]], 1
+; VF2IC2-NEXT:    [[TMP14:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 2
+; VF2IC2-NEXT:    [[TMP15:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], 2
+; VF2IC2-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 0
+; VF2IC2-NEXT:    [[TMP17:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], <2 x i32> [[TMP16]], 2
+; VF2IC2-NEXT:    [[TMP18:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 0
+; VF2IC2-NEXT:    [[TMP19:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], 0
+; VF2IC2-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i32 1
+; VF2IC2-NEXT:    [[TMP21:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], <2 x i32> [[TMP20]], 0
+; VF2IC2-NEXT:    [[TMP22:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 1
+; VF2IC2-NEXT:    [[TMP23:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], 1
+; VF2IC2-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP22]], i32 1
+; VF2IC2-NEXT:    [[TMP25:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], <2 x i32> [[TMP24]], 1
+; VF2IC2-NEXT:    [[TMP26:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 2
+; VF2IC2-NEXT:    [[TMP27:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], 2
+; VF2IC2-NEXT:    [[TMP28:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[TMP26]], i32 1
+; VF2IC2-NEXT:    [[TMP29:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], <2 x i32> [[TMP28]], 2
+; VF2IC2-NEXT:    [[TMP30:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT:    [[TMP31:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP30]]) #[[ATTR2]]
+; VF2IC2-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT:    [[TMP33:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP32]]) #[[ATTR2]]
+; VF2IC2-NEXT:    [[TMP34:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 0
+; VF2IC2-NEXT:    [[TMP35:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i32 0
+; VF2IC2-NEXT:    [[TMP36:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP35]], 0
+; VF2IC2-NEXT:    [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 1
+; VF2IC2-NEXT:    [[TMP38:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], 1
+; VF2IC2-NEXT:    [[TMP39:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP37]], i32 0
+; VF2IC2-NEXT:    [[TMP40:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], <2 x i32> [[TMP39]], 1
+; VF2IC2-NEXT:    [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 2
+; VF2IC2-NEXT:    [[TMP42:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], 2
+; VF2IC2-NEXT:    [[TMP43:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[TMP41]], i32 0
+; VF2IC2-NEXT:    [[TMP44:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], <2 x i32> [[TMP43]], 2
+; VF2IC2-NEXT:    [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 0
+; VF2IC2-NEXT:    [[TMP46:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], 0
+; VF2IC2-NEXT:    [[TMP47:%.*]] = insertelement <2 x i32> [[TMP46]], i32 [[TMP45]], i32 1
+; VF2IC2-NEXT:    [[TMP48:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], <2 x i32> [[TMP47]], 0
+; VF2IC2-NEXT:    [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 1
+; VF2IC2-NEXT:    [[TMP50:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], 1
+; VF2IC2-NEXT:    [[TMP51:%.*]] = insertelement <2 x i32> [[TMP50]], i32 [[TMP49]], i32 1
+; VF2IC2-NEXT:    [[TMP52:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], <2 x i32> [[TMP51]], 1
+; VF2IC2-NEXT:    [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 2
+; VF2IC2-NEXT:    [[TMP54:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], 2
+; VF2IC2-NEXT:    [[TMP55:%.*]] = insertelement <2 x i32> [[TMP54]], i32 [[TMP53]], i32 1
+; VF2IC2-NEXT:    [[TMP56:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], <2 x i32> [[TMP55]], 2
+; VF2IC2-NEXT:    [[TMP57:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 0
+; VF2IC2-NEXT:    [[TMP58:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 0
+; VF2IC2-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 0
+; VF2IC2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 2
+; VF2IC2-NEXT:    store <2 x i32> [[TMP57]], ptr [[TMP60]], align 4
+; VF2IC2-NEXT:    store <2 x i32> [[TMP58]], ptr [[TMP61]], align 4
+; VF2IC2-NEXT:    [[TMP62:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 1
+; VF2IC2-NEXT:    [[TMP63:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 1
+; VF2IC2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
+; VF2IC2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 2
+; VF2IC2-NEXT:    store <2 x i32> [[TMP62]], ptr [[TMP65]], align 4
+; VF2IC2-NEXT:    store <2 x i32> [[TMP63]], ptr [[TMP66]], align 4
+; VF2IC2-NEXT:    [[TMP67:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 2
+; VF2IC2-NEXT:    [[TMP68:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 2
+; VF2IC2-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 0
+; VF2IC2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 2
+; VF2IC2-NEXT:    store <2 x i32> [[TMP67]], ptr [[TMP70]], align 4
+; VF2IC2-NEXT:    store <2 x i32> [[TMP68]], ptr [[TMP71]], align 4
+; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT:    [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT:    br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2IC2:       [[MIDDLE_BLOCK]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, ptr %in, i64 %iv
+  %in_val = load i32, ptr %arrayidx, align 4
+  %call = tail call { i32, i32, i32 } @fn3(i32 %in_val) #2
+  %extract_a = extractvalue { i32, i32, i32 } %call, 0
+  %gep.dst.a = getelementptr inbounds i32, ptr %dst.a, i64 %iv
+  store i32 %extract_a, ptr %gep.dst.a, align 4
+  %extract_b = extractvalue { i32, i32, i32 } %call, 1
+  %gep.dst.b = getelementptr inbounds i32, ptr %dst.b, i64 %iv
+  store i32 %extract_b, ptr %gep.dst.b, align 4
+  %extract_c = extractvalue { i32, i32, i32 } %call, 2
+  %gep.dst.c = getelementptr inbounds i32, ptr %dst.c, i64 %iv
+  store i32 %extract_c, ptr %gep.dst.c, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare { i64 } @fn1(float)
+declare { float, float } @fn2(float)
+declare { i32, i32, i32 } @fn3(i32)
+
+declare { <8 x i64> } @fixed_vec_fn1(<8 x float>)
+declare { <8 x float>, <8 x float> } @fixed_vec_fn2(<8 x float>)
+declare { <8 x i32>, <8 x i32>, <8 x i32> } @fixed_vec_fn3(<8 x i32>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn1(fixed_vec_fn1)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn2(fixed_vec_fn2)" }
+attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn3(fixed_vec_fn3)" }

diff  --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 50b9ba12af82d..6d849c01f4058 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -69,59 +69,6 @@ exit:
   ret void
 }
 
-; CHECK-REMARKS: remark: {{.*}} vectorized loop
-; Note: Later instcombines reduce this down quite a lot.
-define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
-; CHECK-LABEL: define void @struct_return_f32_replicate
-; CHECK-SAME:  (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
-; CHECK:       vector.body:
-; CHECK:         [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}})
-; CHECK:         [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}})
-;                // Lane 0
-; CHECK:         [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0
-; CHECK:         [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK:         [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0
-; CHECK:         [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1
-; CHECK:         [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1
-; CHECK:         [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0
-; CHECK:         [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1
-;                // Lane 1
-; CHECK:         [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0
-; CHECK:         [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0
-; CHECK:         [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1
-; CHECK:         [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0
-; CHECK:         [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1
-; CHECK:         [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1
-; CHECK:         [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i32 1
-; CHECK:         [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x float> [[VEC_B]], 1
-;                // Store wide values:
-; CHECK:         [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0
-; CHECK:         [[VEC_B_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 1
-; CHECK:         store <2 x float> [[VEC_A_EXT]], ptr {{%.*}}, align 4
-; CHECK:         store <2 x float> [[VEC_B_EXT]], ptr {{%.*}}, align 4
-entry:
-  br label %for.body
-
-for.body:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
-  %in_val = load float, ptr %arrayidx, align 4
-  ; #3 does not have a fixed-size vector mapping (so replication is used)
-  %call = tail call { float, float } @foo(float %in_val) #3
-  %extract_a = extractvalue { float, float } %call, 0
-  %extract_b = extractvalue { float, float } %call, 1
-  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
-  store float %extract_a, ptr %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
-  store float %extract_b, ptr %arrayidx4, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, 1024
-  br i1 %exitcond.not, label %exit, label %for.body
-
-exit:
-  ret void
-}
-
 ; CHECK-REMARKS: remark: {{.*}} vectorized loop
 define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks


        


More information about the llvm-commits mailing list