[llvm] 6b29a6f - [X86] Add support for using Sched/Codesize information to `X86FixupInstTuning` Pass.
Noah Goldstein via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 8 09:57:57 PST 2023
Author: Noah Goldstein
Date: 2023-03-08T11:57:38-06:00
New Revision: 6b29a6f27d40cfc4225fe14d0f0a298fde611cad
URL: https://github.com/llvm/llvm-project/commit/6b29a6f27d40cfc4225fe14d0f0a298fde611cad
DIFF: https://github.com/llvm/llvm-project/commit/6b29a6f27d40cfc4225fe14d0f0a298fde611cad.diff
LOG: [X86] Add support for using Sched/Codesize information to `X86FixupInstTuning` Pass.
Use this to handle new transform: `{v}unpck{l|h}pd` -> `{v}shufps`. We
need the sched information here as `{v}shufps` is 1 more byte of code
size, so we only want to make this transformation if `{v}shufps` is
actually faster.
Differential Revision: https://reviews.llvm.org/D144570
Added:
Modified:
llvm/lib/Target/X86/X86FixupInstTuning.cpp
llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 7d0103793142..662cdf9adee7 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -57,6 +57,7 @@ class X86FixupInstTuningPass : public MachineFunctionPass {
private:
const X86InstrInfo *TII = nullptr;
const X86Subtarget *ST = nullptr;
+ const MCSchedModel *SM = nullptr;
};
} // end anonymous namespace
@@ -68,6 +69,14 @@ FunctionPass *llvm::createX86FixupInstTuning() {
return new X86FixupInstTuningPass();
}
+template <typename T>
+static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
+ if (NewVal.has_value() && CurVal.has_value() && *NewVal != *CurVal)
+ return *NewVal < *CurVal;
+
+ return std::nullopt;
+}
+
bool X86FixupInstTuningPass::processInstruction(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &I) {
@@ -75,10 +84,54 @@ bool X86FixupInstTuningPass::processInstruction(
unsigned Opc = MI.getOpcode();
unsigned NumOperands = MI.getDesc().getNumOperands();
+ auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
+ // We already checked that SchedModel exists in `NewOpcPreferable`.
+ return MCSchedModel::getReciprocalThroughput(
+ *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
+ };
+
+ auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
+ // We already checked that SchedModel exists in `NewOpcPreferable`.
+ return MCSchedModel::computeInstrLatency(
+ *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
+ };
+
+ auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
+ if (unsigned Size = TII->get(Opcode).getSize())
+ return Size;
+ // Zero size means we were unable to compute it.
+ return std::nullopt;
+ };
+
+ auto NewOpcPreferable = [&](unsigned NewOpc,
+ bool ReplaceInTie = true) -> bool {
+ std::optional<bool> Res;
+ if (SM->hasInstrSchedModel()) {
+ // Compare tput -> lat -> code size.
+ Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
+ if (Res.has_value())
+ return *Res;
+
+ Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
+ if (Res.has_value())
+ return *Res;
+ }
+
+ Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc));
+ if (Res.has_value())
+ return *Res;
+
+ // We either were unable to get tput/lat/codesize or all values
+ // were equal. Return specified option for a tie.
+ return ReplaceInTie;
+ };
+
// `vpermilps r, i` -> `vshufps r, r, i`
- // `vshufps` is always as fast or faster than `vpermilps` and takes 1 less
- // byte of code size.
+ // `vshufps` is always as fast or faster than
+ // `vpermilps` and takes 1 less byte of code size.
auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
+ if (!NewOpcPreferable(NewOpc))
+ return false;
unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
MI.removeOperand(NumOperands - 1);
MI.addOperand(MI.getOperand(1));
@@ -93,11 +146,33 @@ bool X86FixupInstTuningPass::processInstruction(
auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
// TODO: Might be work adding bypass delay if -Os/-Oz is enabled as
// `vpshufd` saves a byte of code size.
- if (!ST->hasNoDomainDelayShuffle())
+ if (!ST->hasNoDomainDelayShuffle() &&
+ !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
+ return false;
+ MI.setDesc(TII->get(NewOpc));
+ return true;
+ };
+
+ // `vunpcklpd/vmovlhps r, r` -> `vshufps r, r, 0x44`
+ // `vunpckhpd r, r` -> `vshufps r, r, 0xee`
+ // iff `vshufps` is faster than `vunpck{l|h}pd`. Otherwise stick with
+ // `vunpck{l|h}pd` as it uses less code size.
+ // TODO: Look into using `{VP}UNPCK{L|H}QDQ{...}` instead of `{V}SHUF{...}PS`
+ // as the replacement. `{VP}UNPCK{L|H}QDQ{...}` has no codesize cost.
+ auto ProcessUNPCKPD = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
+ if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
return false;
+
MI.setDesc(TII->get(NewOpc));
+ MI.addOperand(MachineOperand::CreateImm(MaskImm));
return true;
};
+ auto ProcessUNPCKLPDrr = [&](unsigned NewOpc) -> bool {
+ return ProcessUNPCKPD(NewOpc, 0x44);
+ };
+ auto ProcessUNPCKHPDrr = [&](unsigned NewOpc) -> bool {
+ return ProcessUNPCKPD(NewOpc, 0xee);
+ };
// TODO: Add masked predicate execution variants.
switch (Opc) {
@@ -123,6 +198,41 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
case X86::VPERMILPSZmi:
return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
+
+ // TODO: {V}UNPCK{L|H}PD{...} is probably safe to transform to
+ // `{VP}UNPCK{L|H}QDQ{...}` which gets the same perf benefit as
+ // `{V}SHUF{...}PS` but 1) without increasing code size and 2) can also
+ // handle the `mr` case. ICL doesn't have a domain penalty for replacing
+ // float unpck -> int unpck, but at this time, I haven't verified the set of
+ // processors where it's safe.
+ case X86::MOVLHPSrr:
+ case X86::UNPCKLPDrr:
+ return ProcessUNPCKLPDrr(X86::SHUFPSrri);
+ case X86::VMOVLHPSrr:
+ case X86::VUNPCKLPDrr:
+ return ProcessUNPCKLPDrr(X86::VSHUFPSrri);
+ case X86::VUNPCKLPDYrr:
+ return ProcessUNPCKLPDrr(X86::VSHUFPSYrri);
+ // VMOVLHPS is always 128 bits.
+ case X86::VMOVLHPSZrr:
+ case X86::VUNPCKLPDZ128rr:
+ return ProcessUNPCKLPDrr(X86::VSHUFPSZ128rri);
+ case X86::VUNPCKLPDZ256rr:
+ return ProcessUNPCKLPDrr(X86::VSHUFPSZ256rri);
+ case X86::VUNPCKLPDZrr:
+ return ProcessUNPCKLPDrr(X86::VSHUFPSZrri);
+ case X86::UNPCKHPDrr:
+ return ProcessUNPCKHPDrr(X86::SHUFPSrri);
+ case X86::VUNPCKHPDrr:
+ return ProcessUNPCKHPDrr(X86::VSHUFPSrri);
+ case X86::VUNPCKHPDYrr:
+ return ProcessUNPCKHPDrr(X86::VSHUFPSYrri);
+ case X86::VUNPCKHPDZ128rr:
+ return ProcessUNPCKHPDrr(X86::VSHUFPSZ128rri);
+ case X86::VUNPCKHPDZ256rr:
+ return ProcessUNPCKHPDrr(X86::VSHUFPSZ256rri);
+ case X86::VUNPCKHPDZrr:
+ return ProcessUNPCKHPDrr(X86::VSHUFPSZrri);
default:
return false;
}
@@ -133,6 +243,8 @@ bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
ST = &MF.getSubtarget<X86Subtarget>();
TII = ST->getInstrInfo();
+ SM = &ST->getSchedModel();
+
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
if (processInstruction(MF, MBB, I)) {
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
index 5b96196a4ec5..67f27544150c 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
@@ -25,37 +25,117 @@ define <16 x float> @transform_VUNPCKHPDZrr(<16 x float> %a, <16 x float> %b) no
}
define <8 x float> @transform_VUNPCKLPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1],ymm0[4,5],ymm1[4,5]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ZNVER4-NEXT: retq
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
ret <8 x float> %shufp
}
define <8 x float> @transform_VUNPCKHPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3],ymm0[6,7],ymm1[6,7]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ZNVER4-NEXT: retq
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
ret <8 x float> %shufp
}
define <4 x float> @transform_VUNPCKLPDrr(<4 x float> %a, <4 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrr:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrr:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDrr:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ZNVER4-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x float> %shufp
}
define <4 x float> @transform_VUNPCKHPDrr(<4 x float> %a, <4 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrr:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,3]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrr:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDrr:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ZNVER4-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x float> %shufp
}
@@ -431,9 +511,3 @@ define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x doubl
%res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c
ret <2 x double> %res
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-AVX512: {{.*}}
-; CHECK-ICX: {{.*}}
-; CHECK-SKX: {{.*}}
-; CHECK-V4: {{.*}}
-; CHECK-ZNVER4: {{.*}}
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
index ecdb60481850..9de90fd8da4a 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
@@ -1,40 +1,60 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake | FileCheck %s --check-prefixes=CHECK,CHECK-SKL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-V3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX2,CHECK-SKL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX2,CHECK-V3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-ICX
define <8 x float> @transform_VUNPCKLPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT: retq
+; CHECK-AVX2-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1],ymm0[4,5],ymm1[4,5]
+; CHECK-ICX-NEXT: retq
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
ret <8 x float> %shufp
}
define <8 x float> @transform_VUNPCKHPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT: retq
+; CHECK-AVX2-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3],ymm0[6,7],ymm1[6,7]
+; CHECK-ICX-NEXT: retq
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
ret <8 x float> %shufp
}
define <4 x float> @transform_VUNPCKLPDrr(<4 x float> %a, <4 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: retq
+; CHECK-AVX2-LABEL: transform_VUNPCKLPDrr:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; CHECK-ICX-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x float> %shufp
}
define <4 x float> @transform_VUNPCKHPDrr(<4 x float> %a, <4 x float> %b) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-NEXT: retq
+; CHECK-AVX2-LABEL: transform_VUNPCKHPDrr:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ICX-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,3]
+; CHECK-ICX-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x float> %shufp
}
@@ -79,6 +99,5 @@ define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
ret <4 x float> %shufp
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-ICX: {{.*}}
; CHECK-SKL: {{.*}}
; CHECK-V3: {{.*}}
More information about the llvm-commits
mailing list