[llvm] [VectorCombine] isExtractExtractCheap - specify the extract/insert shuffle mask to improve shuffle costs (PR #114780)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 11 04:22:58 PST 2024


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/114780

>From 9bfddafabd11b771f7c2781d5568fc73d5ef5efa Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 4 Nov 2024 12:01:23 +0000
Subject: [PATCH 1/5] [VectorCombine] isExtractExtractCheap - specify the
 extract/insert shuffle mask to improve shuffle costs

This shuffle mask is so focused, the cost model is very likely to be able to determine a specific (lower) cost
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 14 +++-
 .../Transforms/PhaseOrdering/X86/pr50392.ll   | 29 +++-----
 .../load-extractelement-scalarization.ll      | 72 +++++++++----------
 .../X86/extract-binop-inseltpoison.ll         | 16 +++--
 .../VectorCombine/X86/extract-binop.ll        | 16 +++--
 5 files changed, 80 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b8754b03c2ebc65..98b293898cccf03 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -449,6 +449,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
   // TODO: Evaluate whether that always results in lowest cost. Alternatively,
   //       check the cost of creating a broadcast shuffle and shuffling both
   //       operands to element 0.
+  unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
+  unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
   InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
 
   // Extra uses of the extracts mean that we include those costs in the
@@ -484,8 +486,16 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
     // ShufMask = { poison, poison, 0, poison }
     // TODO: The cost model has an option for a "broadcast" shuffle
     //       (splat-from-element-0), but no option for a more general splat.
-    NewCost +=
-        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+    if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
+      SmallVector<int> ShuffleMask;
+      ShuffleMask.append(FixedVecTy->getNumElements(), PoisonMaskElem);
+      ShuffleMask[BestInsIndex] = BestExtIndex;
+      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    VecTy, ShuffleMask);
+    } else {
+      NewCost +=
+          TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+    }
   }
 
   // Aggressively form a vector op if the cost is equal because the transform
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
index d0568f3b961fdef..82e92be7c21bcc9 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
@@ -30,24 +30,14 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD12]], i64 3
 ; SSE4-NEXT:    ret <4 x double> [[SHUFFLE]]
 ;
-; AVX1-LABEL: @PR50392(
-; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
-; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
-; AVX1-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; AVX1-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2
-; AVX1-NEXT:    [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3
-; AVX1-NEXT:    [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]]
-; AVX1-NEXT:    [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP4]], double [[ADD12]], i64 3
-; AVX1-NEXT:    ret <4 x double> [[SHUFFLE]]
-;
-; AVX2-LABEL: @PR50392(
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
-; AVX2-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX2-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX2-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP5]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
-; AVX2-NEXT:    ret <4 x double> [[SHUFFLE]]
+; AVX-LABEL: @PR50392(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    ret <4 x double> [[SHUFFLE]]
 ;
   %vecext = extractelement <4 x double> %a, i32 0
   %vecext1 = extractelement <4 x double> %a, i32 1
@@ -69,5 +59,6 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
   ret <4 x double> %shuffle
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
+; AVX1: {{.*}}
+; AVX2: {{.*}}
 ; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
index 613c71cb769b0c9..551d6d1cabd41e5 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
@@ -541,18 +541,18 @@ define i32 @load_extract_clobber_store_between(ptr %x, ptr %y) {
 define i32 @load_extract_clobber_store_between_limit(ptr %x, ptr %y, <8 x i32> %z) {
 ; CHECK-LABEL: @load_extract_clobber_store_between_limit(
 ; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[Z_0:%.*]] = extractelement <8 x i32> [[Z:%.*]], i32 0
-; CHECK-NEXT:    [[Z_1:%.*]] = extractelement <8 x i32> [[Z]], i32 1
-; CHECK-NEXT:    [[ADD_0:%.*]] = add i32 [[Z_0]], [[Z_1]]
-; CHECK-NEXT:    [[Z_2:%.*]] = extractelement <8 x i32> [[Z]], i32 2
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[Z_2]]
-; CHECK-NEXT:    [[Z_3:%.*]] = extractelement <8 x i32> [[Z]], i32 3
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[Z_3]]
-; CHECK-NEXT:    [[Z_4:%.*]] = extractelement <8 x i32> [[Z]], i32 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[Z_4]]
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[Z1:%.*]], <8 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[Z1]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[TMP1]], [[SHIFT1]]
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[SHIFT2]]
+; CHECK-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[Z:%.*]] = add <8 x i32> [[TMP3]], [[SHIFT3]]
+; CHECK-NEXT:    [[Z_0:%.*]] = extractelement <8 x i32> [[Z]], i32 0
 ; CHECK-NEXT:    store i8 0, ptr [[Y:%.*]], align 1
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
-; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[R]]
+; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[Z_0]], [[R]]
 ; CHECK-NEXT:    ret i32 [[ADD_4]]
 ;
   %lv = load <4 x i32>, ptr %x
@@ -573,35 +573,35 @@ define i32 @load_extract_clobber_store_between_limit(ptr %x, ptr %y, <8 x i32> %
 
 define i32 @load_extract_clobber_store_after_limit(ptr %x, ptr %y, <8 x i32> %z) {
 ; LIMIT-DEFAULT-LABEL: @load_extract_clobber_store_after_limit(
-; LIMIT-DEFAULT-NEXT:    [[Z_0:%.*]] = extractelement <8 x i32> [[Z:%.*]], i32 0
-; LIMIT-DEFAULT-NEXT:    [[Z_1:%.*]] = extractelement <8 x i32> [[Z]], i32 1
-; LIMIT-DEFAULT-NEXT:    [[ADD_0:%.*]] = add i32 [[Z_0]], [[Z_1]]
-; LIMIT-DEFAULT-NEXT:    [[Z_2:%.*]] = extractelement <8 x i32> [[Z]], i32 2
-; LIMIT-DEFAULT-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[Z_2]]
-; LIMIT-DEFAULT-NEXT:    [[Z_3:%.*]] = extractelement <8 x i32> [[Z]], i32 3
-; LIMIT-DEFAULT-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[Z_3]]
-; LIMIT-DEFAULT-NEXT:    [[Z_4:%.*]] = extractelement <8 x i32> [[Z]], i32 4
-; LIMIT-DEFAULT-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[Z_4]]
+; LIMIT-DEFAULT-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[Z1:%.*]], <8 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; LIMIT-DEFAULT-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[Z1]], [[SHIFT]]
+; LIMIT-DEFAULT-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; LIMIT-DEFAULT-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[TMP4]], [[SHIFT1]]
+; LIMIT-DEFAULT-NEXT:    [[SHIFT2:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; LIMIT-DEFAULT-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[SHIFT2]]
+; LIMIT-DEFAULT-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; LIMIT-DEFAULT-NEXT:    [[Z:%.*]] = add <8 x i32> [[TMP3]], [[SHIFT3]]
+; LIMIT-DEFAULT-NEXT:    [[Z_0:%.*]] = extractelement <8 x i32> [[Z]], i32 0
 ; LIMIT-DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[X:%.*]], i32 0, i32 2
 ; LIMIT-DEFAULT-NEXT:    [[R:%.*]] = load i32, ptr [[TMP1]], align 8
 ; LIMIT-DEFAULT-NEXT:    store i8 0, ptr [[Y:%.*]], align 1
-; LIMIT-DEFAULT-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[R]]
+; LIMIT-DEFAULT-NEXT:    [[ADD_4:%.*]] = add i32 [[Z_0]], [[R]]
 ; LIMIT-DEFAULT-NEXT:    ret i32 [[ADD_4]]
 ;
 ; LIMIT2-LABEL: @load_extract_clobber_store_after_limit(
 ; LIMIT2-NEXT:    [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
-; LIMIT2-NEXT:    [[Z_0:%.*]] = extractelement <8 x i32> [[Z:%.*]], i32 0
-; LIMIT2-NEXT:    [[Z_1:%.*]] = extractelement <8 x i32> [[Z]], i32 1
-; LIMIT2-NEXT:    [[ADD_0:%.*]] = add i32 [[Z_0]], [[Z_1]]
-; LIMIT2-NEXT:    [[Z_2:%.*]] = extractelement <8 x i32> [[Z]], i32 2
-; LIMIT2-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[Z_2]]
-; LIMIT2-NEXT:    [[Z_3:%.*]] = extractelement <8 x i32> [[Z]], i32 3
-; LIMIT2-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[Z_3]]
-; LIMIT2-NEXT:    [[Z_4:%.*]] = extractelement <8 x i32> [[Z]], i32 4
-; LIMIT2-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[Z_4]]
+; LIMIT2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[Z1:%.*]], <8 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; LIMIT2-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[Z1]], [[SHIFT]]
+; LIMIT2-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; LIMIT2-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[TMP1]], [[SHIFT1]]
+; LIMIT2-NEXT:    [[SHIFT2:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; LIMIT2-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[SHIFT2]]
+; LIMIT2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; LIMIT2-NEXT:    [[Z:%.*]] = add <8 x i32> [[TMP3]], [[SHIFT3]]
+; LIMIT2-NEXT:    [[Z_0:%.*]] = extractelement <8 x i32> [[Z]], i32 0
 ; LIMIT2-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
 ; LIMIT2-NEXT:    store i8 0, ptr [[Y:%.*]], align 1
-; LIMIT2-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[R]]
+; LIMIT2-NEXT:    [[ADD_4:%.*]] = add i32 [[Z_0]], [[R]]
 ; LIMIT2-NEXT:    ret i32 [[ADD_4]]
 ;
   %lv = load <4 x i32>, ptr %x
@@ -671,9 +671,9 @@ define i1 @load_with_non_power_of_2_element_type_2(ptr %x) {
 define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(
 ; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i32 0
-; CHECK-NEXT:    [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i32 1
-; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]
+; CHECK-NEXT:    [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <4 x i32>, ptr %x
@@ -688,9 +688,9 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable(
 ; CHECK-NEXT:    [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[E_0:%.*]] = extractelement <8 x i32> [[LV]], i32 0
-; CHECK-NEXT:    [[E_1:%.*]] = extractelement <8 x i32> [[LV]], i32 6
-; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[LV]], <8 x i32> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[LV]], [[SHIFT]]
+; CHECK-NEXT:    [[RES:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <8 x i32>, ptr %x, align 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 3d69f15fc5f249f..effd846e7e6336e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -404,11 +404,17 @@ define float @ext0_ext8_fmul_v16f32(<16 x float> %x) {
 }
 
 define float @ext14_ext15_fmul_v16f32(<16 x float> %x) {
-; CHECK-LABEL: @ext14_ext15_fmul_v16f32(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 14
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x float> [[X]], i32 15
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
-; CHECK-NEXT:    ret float [[R]]
+; SSE-LABEL: @ext14_ext15_fmul_v16f32(
+; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 14
+; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x float> [[X]], i32 15
+; SSE-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
+; SSE-NEXT:    ret float [[R]]
+;
+; AVX-LABEL: @ext14_ext15_fmul_v16f32(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X:%.*]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 15, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[X]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; AVX-NEXT:    ret float [[R]]
 ;
   %e0 = extractelement <16 x float> %x, i32 14
   %e1 = extractelement <16 x float> %x, i32 15
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 52f7cd859a1ab1c..7481ae0de4ad840 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -404,11 +404,17 @@ define float @ext0_ext8_fmul_v16f32(<16 x float> %x) {
 }
 
 define float @ext14_ext15_fmul_v16f32(<16 x float> %x) {
-; CHECK-LABEL: @ext14_ext15_fmul_v16f32(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 14
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x float> [[X]], i32 15
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
-; CHECK-NEXT:    ret float [[R]]
+; SSE-LABEL: @ext14_ext15_fmul_v16f32(
+; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 14
+; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x float> [[X]], i32 15
+; SSE-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
+; SSE-NEXT:    ret float [[R]]
+;
+; AVX-LABEL: @ext14_ext15_fmul_v16f32(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X:%.*]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 15, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[X]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; AVX-NEXT:    ret float [[R]]
 ;
   %e0 = extractelement <16 x float> %x, i32 14
   %e1 = extractelement <16 x float> %x, i32 15

>From f93e617e90705b8c6c23b0455d6b5f15384a09ab Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 4 Nov 2024 14:51:20 +0000
Subject: [PATCH 2/5] Cleanup SmallVector initialization

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 98b293898cccf03..fa962a29dc350b5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -487,8 +487,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
     // TODO: The cost model has an option for a "broadcast" shuffle
     //       (splat-from-element-0), but no option for a more general splat.
     if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
-      SmallVector<int> ShuffleMask;
-      ShuffleMask.append(FixedVecTy->getNumElements(), PoisonMaskElem);
+      SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
+                                   PoisonMaskElem);
       ShuffleMask[BestInsIndex] = BestExtIndex;
       NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                     VecTy, ShuffleMask);

>From 6a3a4d0c98b08d66459c15c04dc53bdfd9fd9f8c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 6 Nov 2024 12:35:47 +0000
Subject: [PATCH 3/5] Add explicit getShuffleCost arguments for CostKind and
 the source vector

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fa962a29dc350b5..23d8ca8e77ea192 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -451,6 +451,7 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
   //       operands to element 0.
   unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
   unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
+  Value *BestVec = (Extract0Cost > Extract1Cost ? Ext1 : Ext0)->getOperand(0);
   InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
 
   // Extra uses of the extracts mean that we include those costs in the
@@ -490,11 +491,12 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
       SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
                                    PoisonMaskElem);
       ShuffleMask[BestInsIndex] = BestExtIndex;
-      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                    VecTy, ShuffleMask);
-    } else {
       NewCost +=
-          TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+          TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
+                             ShuffleMask, CostKind, 0, nullptr, {BestVec});
+    } else {
+      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    VecTy, {}, CostKind, 0, nullptr, {BestVec});
     }
   }
 

>From 91c1dfacfa7a0f32abb2e5391e07eaeee3834023 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 6 Nov 2024 12:46:08 +0000
Subject: [PATCH 4/5] Cleanup check-prefixes

---
 .../Transforms/PhaseOrdering/X86/pr50392.ll   | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
index 82e92be7c21bcc9..4e1051d1991aa8d 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64    -O3                   -S < %s  | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3                   -S < %s  | FileCheck %s --check-prefixes=SSE,SSE4
-; RUN: opt -mtriple=x86_64-- -mcpu=btver2    -O3                   -S < %s  | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -O3                   -S < %s  | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64    -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=SSE,SSE4
-; RUN: opt -mtriple=x86_64-- -mcpu=btver2    -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64    -O3                   -S < %s  | FileCheck %s --check-prefixes=SSE2
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3                   -S < %s  | FileCheck %s --check-prefixes=SSE4
+; RUN: opt -mtriple=x86_64-- -mcpu=btver2    -O3                   -S < %s  | FileCheck %s --check-prefixes=AVX
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -O3                   -S < %s  | FileCheck %s --check-prefixes=AVX
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64    -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=SSE2
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=SSE4
+; RUN: opt -mtriple=x86_64-- -mcpu=btver2    -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=AVX
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=AVX
 
 define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
 ; SSE2-LABEL: @PR50392(
@@ -58,7 +58,3 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
   %shuffle = shufflevector <4 x double> %vecinit13, <4 x double> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
   ret <4 x double> %shuffle
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX1: {{.*}}
-; AVX2: {{.*}}
-; SSE: {{.*}}

>From c1f9c24308db310c9527c13ad5da76d38da7b970 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 7 Nov 2024 11:53:48 +0000
Subject: [PATCH 5/5] Use ConvertToShuffle vector source value directly

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 23d8ca8e77ea192..d4d0f3eac182a75 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -451,7 +451,6 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
   //       operands to element 0.
   unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
   unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
-  Value *BestVec = (Extract0Cost > Extract1Cost ? Ext1 : Ext0)->getOperand(0);
   InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
 
   // Extra uses of the extracts mean that we include those costs in the
@@ -491,12 +490,13 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
       SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
                                    PoisonMaskElem);
       ShuffleMask[BestInsIndex] = BestExtIndex;
+      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    VecTy, ShuffleMask, CostKind, 0, nullptr,
+                                    {ConvertToShuffle});
+    } else {
       NewCost +=
           TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
-                             ShuffleMask, CostKind, 0, nullptr, {BestVec});
-    } else {
-      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                    VecTy, {}, CostKind, 0, nullptr, {BestVec});
+                             {}, CostKind, 0, nullptr, {ConvertToShuffle});
     }
   }
 



More information about the llvm-commits mailing list