[llvm] r313476 - [X86] Add patterns to make blends with immediate control commutable during isel for load folding.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Sep 16 22:06:05 PDT 2017
Author: ctopper
Date: Sat Sep 16 22:06:05 2017
New Revision: 313476
URL: http://llvm.org/viewvc/llvm-project?rev=313476&view=rev
Log:
[X86] Add patterns to make blends with immediate control commutable during isel for load folding.
Modified:
llvm/trunk/lib/Target/X86/X86InstrSSE.td
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=313476&r1=313475&r2=313476&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Sat Sep 16 22:06:05 2017
@@ -6542,6 +6542,21 @@ multiclass SS41I_binop_rmi<bits<8> opc,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
+def BlendCommuteImm2 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0x03;
+ return getI8Imm(Imm ^ 0x03, SDLoc(N));
+}]>;
+
+def BlendCommuteImm4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0x0f;
+ return getI8Imm(Imm ^ 0x0f, SDLoc(N));
+}]>;
+
+def BlendCommuteImm8 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0xff;
+ return getI8Imm(Imm ^ 0xff, SDLoc(N));
+}]>;
+
let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
@@ -6549,26 +6564,6 @@ let Predicates = [HasAVX] in {
DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG;
}
- let ExeDomain = SSEPackedSingle in {
- defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
- VR128, loadv4f32, f128mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
- defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
- VR256, loadv8f32, f256mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
- }
- let ExeDomain = SSEPackedDouble in {
- defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
- VR128, loadv2f64, f128mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
- defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
- VR256, loadv4f64, f256mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
- }
- defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
- VR128, loadv2i64, i128mem, 0,
- DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG;
-
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, loadv4f32, f128mem, 0,
@@ -6589,9 +6584,6 @@ let Predicates = [HasAVX2] in {
VR256, loadv4i64, i256mem, 0,
DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
- defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
- VR256, loadv4i64, i256mem, 0,
- DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -6600,17 +6592,7 @@ let Constraints = "$src1 = $dst" in {
VR128, memopv2i64, i128mem,
1, SSE_MPSADBW_ITINS>;
}
- let ExeDomain = SSEPackedSingle in
- defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
- VR128, memopv4f32, f128mem,
- 1, SSE_INTALU_ITINS_FBLEND_P>;
- let ExeDomain = SSEPackedDouble in
- defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
- VR128, memopv2f64, f128mem,
- 1, SSE_INTALU_ITINS_FBLEND_P>;
- defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
- VR128, memopv2i64, i128mem,
- 1, SSE_INTALU_ITINS_BLEND_P>;
+
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
VR128, memopv4f32, f128mem, 1,
@@ -6621,6 +6603,82 @@ let Constraints = "$src1 = $dst" in {
SSE_DPPD_ITINS>;
}
+/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
+multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr, Domain d,
+ OpndItins itins, SDNodeXForm commuteXForm> {
+let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+ // Pattern to commute if load is in first source.
+ def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
+ RC:$src1, imm:$src3)),
+ (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
+ (commuteXForm imm:$src3))>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
+ VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
+ DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
+ VEX_4V, VEX_WIG;
+ defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
+ VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
+ DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm8>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
+ VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
+ DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm2>,
+ VEX_4V, VEX_WIG;
+ defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
+ VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
+ DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
+ VR128, loadv2i64, i128mem, 0, SSEPackedInt,
+ DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
+ VEX_4V, VEX_WIG;
+}
+
+let Predicates = [HasAVX2] in {
+ defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
+ VR256, loadv4i64, i256mem, 0, SSEPackedInt,
+ DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
+ VEX_4V, VEX_L, VEX_WIG;
+}
+
+defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
+ VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
+ SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm4>;
+defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
+ VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
+ SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm2>;
+defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
+ VR128, memopv2i64, i128mem, 1, SSEPackedInt,
+ SSE_INTALU_ITINS_BLEND_P, BlendCommuteImm8>;
+
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
@@ -7810,10 +7868,10 @@ let Predicates = [HasF16C, NoVLX] in {
// AVX2 Instructions
//===----------------------------------------------------------------------===//
-/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
-multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
+multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop> {
+ X86MemOperand x86memop, SDNodeXForm commuteXForm> {
let isCommutable = 1 in
def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -7829,12 +7887,19 @@ multiclass AVX2_binop_rmi<bits<8> opc, s
(OpVT (OpNode RC:$src1,
(bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
+
+ // Pattern to commute if load is in first source.
+ def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
+ RC:$src1, imm:$src3)),
+ (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
+ (commuteXForm imm:$src3))>;
}
-defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
- VR128, loadv2i64, i128mem>;
-defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
- VR256, loadv4i64, i256mem>, VEX_L;
+defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
+ VR128, loadv2i64, i128mem, BlendCommuteImm4>;
+defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
+ VR256, loadv4i64, i256mem, BlendCommuteImm8>,
+ VEX_L;
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
More information about the llvm-commits mailing list