[llvm] r337357 - [X86] Enable commuting of VUNPCKHPD to VMOVLHPS to enable load folding by using VMOVLPS with a modified address.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 18 00:31:32 PDT 2018
Author: ctopper
Date: Wed Jul 18 00:31:32 2018
New Revision: 337357
URL: http://llvm.org/viewvc/llvm-project?rev=337357&view=rev
Log:
[X86] Enable commuting of VUNPCKHPD to VMOVLHPS to enable load folding by using VMOVLPS with a modified address.
This required an annoying number of TableGen multiclass changes to make only VUNPCKHPDZ128rr commutable.
Modified:
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
llvm/trunk/lib/Target/X86/X86InstrSSE.td
llvm/trunk/test/CodeGen/X86/fma.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=337357&r1=337356&r2=337357&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Wed Jul 18 00:31:32 2018
@@ -198,7 +198,8 @@ multiclass AVX512_maskable_custom<bits<8
list<dag> ZeroMaskingPattern,
string MaskingConstraint = "",
bit IsCommutable = 0,
- bit IsKCommutable = 0> {
+ bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
@@ -218,7 +219,7 @@ multiclass AVX512_maskable_custom<bits<8
// Zero mask does not add any restrictions to commute operands transformation.
// So, it is Ok to use IsCommutable instead of IsKCommutable.
- let isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
+ let isCommutable = IsKZCommutable in // Prefer over VMOV*rrkz Pat<>
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
@@ -237,7 +238,8 @@ multiclass AVX512_maskable_common<bits<8
SDNode Select = vselect,
string MaskingConstraint = "",
bit IsCommutable = 0,
- bit IsKCommutable = 0> :
+ bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
@@ -245,7 +247,7 @@ multiclass AVX512_maskable_common<bits<8
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
MaskingConstraint, IsCommutable,
- IsKCommutable>;
+ IsKCommutable, IsKZCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -276,13 +278,15 @@ multiclass AVX512_maskable<bits<8> O, Fo
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
bit IsCommutable = 0, bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable,
SDNode Select = vselect> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(Select _.KRCWM:$mask, RHS, _.RC:$src0),
- Select, "$src0 = $dst", IsCommutable, IsKCommutable>;
+ Select, "$src0 = $dst", IsCommutable, IsKCommutable,
+ IsKZCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
@@ -292,7 +296,7 @@ multiclass AVX512_maskable_scalar<bits<8
dag RHS,
bit IsCommutable = 0> :
AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
- RHS, IsCommutable, 0, X86selects>;
+ RHS, IsCommutable, 0, IsCommutable, X86selects>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -5312,12 +5316,14 @@ defm VMAXCSDZ : avx512_comutable_binop_s
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
- bit IsCommutable> {
+ bit IsCommutable,
+ bit IsKZCommutable = IsCommutable> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable>,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, 0,
+ IsKZCommutable>,
EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -5361,7 +5367,8 @@ multiclass avx512_fp_sae_packed<bits<8>
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
Predicate prd, X86SchedWriteSizes sched,
- bit IsCommutable = 0> {
+ bit IsCommutable = 0,
+ bit IsPD128Commutable = IsCommutable> {
let Predicates = [prd] in {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
sched.PS.ZMM, IsCommutable>, EVEX_V512, PS,
@@ -5380,7 +5387,8 @@ multiclass avx512_fp_binop_p<bits<8> opc
sched.PS.YMM, IsCommutable>, EVEX_V256, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
- sched.PD.XMM, IsCommutable>, EVEX_V128, PD, VEX_W,
+ sched.PD.XMM, IsPD128Commutable,
+ IsCommutable>, EVEX_V128, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
sched.PD.YMM, IsCommutable>, EVEX_V256, PD, VEX_W,
@@ -6426,6 +6434,7 @@ def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrc
"vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))]>,
Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V;
+let isCommutable = 1 in
def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -10854,7 +10863,7 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (
//===----------------------------------------------------------------------===//
defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
- SchedWriteFShuffleSizes>;
+ SchedWriteFShuffleSizes, 0, 1>;
defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
SchedWriteFShuffleSizes>;
Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=337357&r1=337356&r2=337357&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Wed Jul 18 00:31:32 2018
@@ -1692,14 +1692,22 @@ MachineInstr *X86InstrInfo::commuteInstr
OpIdx1, OpIdx2);
}
case X86::MOVHLPSrr:
- case X86::UNPCKHPDrr: {
+ case X86::UNPCKHPDrr:
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ case X86::VMOVHLPSZrr:
+ case X86::VUNPCKHPDZ128rr: {
assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
unsigned Opc = MI.getOpcode();
switch (Opc) {
- default: llvm_unreachable("Unreachable!");
- case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
- case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
+ case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
+ case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
+ case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
+ case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
}
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
@@ -1990,6 +1998,10 @@ bool X86InstrInfo::findCommutedOpIndices
return false;
case X86::MOVHLPSrr:
case X86::UNPCKHPDrr:
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ case X86::VMOVHLPSZrr:
+ case X86::VUNPCKHPDZ128rr:
if (Subtarget.hasSSE2())
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return false;
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=337357&r1=337356&r2=337357&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Wed Jul 18 00:31:32 2018
@@ -812,6 +812,7 @@ let Predicates = [UseAVX] in {
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
+ let isCommutable = 1 in
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2190,7 +2191,7 @@ defm VUNPCKHPS: sse12_unpack_interleave<
SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
Modified: llvm/trunk/test/CodeGen/X86/fma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma.ll?rev=337357&r1=337356&r2=337357&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fma.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fma.ll Wed Jul 18 00:31:32 2018
@@ -1410,10 +1410,10 @@ define <2 x double> @test_v2f64(<2 x dou
; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30]
; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x15,0x44,0x24,0x40]
-; FMACALL32-NEXT: ## xmm0 = xmm0[1],mem[1]
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58]
+; FMACALL32-NEXT: ## xmm0 = mem[0,1],xmm0[2,3]
; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
@@ -1547,10 +1547,10 @@ define <4 x double> @test_v4f64(<4 x dou
; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70]
; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x15,0x44,0x24,0x50]
-; FMACALL32-NEXT: ## xmm0 = xmm0[1],mem[1]
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68]
+; FMACALL32-NEXT: ## xmm0 = mem[0,1],xmm0[2,3]
; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30]
; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
@@ -1809,10 +1809,10 @@ define <8 x double> @test_v8f64(<8 x dou
; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x50,0x01,0x00,0x00]
-; FMACALL32-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x40,0x01,0x00,0x00]
-; FMACALL32-NEXT: ## xmm0 = xmm0[1],mem[1]
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00]
+; FMACALL32-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00]
+; FMACALL32-NEXT: ## xmm0 = mem[0,1],xmm0[2,3]
; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60]
; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll?rev=337357&r1=337356&r2=337357&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll Wed Jul 18 00:31:32 2018
@@ -1310,8 +1310,7 @@ define <2 x double> @shuffle_mem_v2f64_3
;
; AVX-LABEL: shuffle_mem_v2f64_31:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; AVX-NEXT: retq
%c = load <2 x double>, <2 x double>* %b
%f = shufflevector <2 x double> %a, <2 x double> %c, <2 x i32> <i32 3, i32 1>
More information about the llvm-commits
mailing list