[llvm] 718d50d - [VectorCombine] foldPermuteOfBinops - prefer the new fold for matching costs.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 1 10:29:02 PDT 2024
Author: Simon Pilgrim
Date: 2024-11-01T17:28:37Z
New Revision: 718d50d6d03449962b14a3c8357a6ee3fa145f36
URL: https://github.com/llvm/llvm-project/commit/718d50d6d03449962b14a3c8357a6ee3fa145f36
DIFF: https://github.com/llvm/llvm-project/commit/718d50d6d03449962b14a3c8357a6ee3fa145f36.diff
LOG: [VectorCombine] foldPermuteOfBinops - prefer the new fold for matching costs.
Minor tweak to #114101 - as we're reducing the instruction count, we should prefer the fold if the old/new costs are the same.
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 3283cc8a229e5c..025234c54956b2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1478,7 +1478,9 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");
- if (NewCost >= OldCost)
+
+ // If costs are equal, still fold as we reduce instruction count.
+ if (NewCost > OldCost)
return false;
Value *Shuf0 = Builder.CreateShuffleVector(Op00, Op01, NewMask0);
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
index 53d4b1ad96cb82..d0568f3b961fde 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
@@ -1,30 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE4
; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE,SSE4
; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=AVX,AVX2
define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: @PR50392(
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SSE-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SSE-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]]
-; SSE-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP4]], double [[ADD12]], i64 3
-; SSE-NEXT: ret <4 x double> [[SHUFFLE]]
+; SSE2-LABEL: @PR50392(
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; SSE2-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SSE2-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]]
+; SSE2-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP4]], double [[ADD12]], i64 3
+; SSE2-NEXT: ret <4 x double> [[SHUFFLE]]
+;
+; SSE4-LABEL: @PR50392(
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SSE4-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]]
+; SSE4-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD12]], i64 3
+; SSE4-NEXT: ret <4 x double> [[SHUFFLE]]
;
; AVX1-LABEL: @PR50392(
-; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
+; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
+; AVX1-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
; AVX1-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2
; AVX1-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3
; AVX1-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]]
@@ -61,3 +70,4 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX: {{.*}}
+; SSE: {{.*}}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll
index 6ff68f50db1b7a..ec9bfc542e32f5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll
@@ -1,33 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE4
; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE
-; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE,SSE4
; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=AVX,AVX2
define <4 x double> @PR94546(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: @PR94546(
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 6>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 7>
-; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
-; SSE-NEXT: ret <4 x double> [[TMP4]]
+; SSE2-LABEL: @PR94546(
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 6>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 7>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; SSE2-NEXT: ret <4 x double> [[TMP4]]
;
-; AVX1-LABEL: @PR94546(
-; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 6>
-; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 7>
-; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
-; AVX1-NEXT: ret <4 x double> [[TMP4]]
+; SSE4-LABEL: @PR94546(
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 poison, i32 6>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 poison, i32 7>
+; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
-; AVX2-LABEL: @PR94546(
-; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 poison, i32 6>
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 poison, i32 7>
-; AVX2-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; AVX2-NEXT: ret <4 x double> [[TMP3]]
+; AVX-LABEL: @PR94546(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 poison, i32 6>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 poison, i32 7>
+; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x double> [[TMP3]]
;
%vecext = extractelement <4 x double> %a, i32 0
%vecext1 = extractelement <4 x double> %a, i32 1
@@ -49,4 +48,6 @@ define <4 x double> @PR94546(<4 x double> %a, <4 x double> %b) {
ret <4 x double> %shuffle
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
+; AVX1: {{.*}}
+; AVX2: {{.*}}
+; SSE: {{.*}}
More information about the llvm-commits
mailing list