[PATCH] D37653: [X86] Add isel pattern infrastructure to begin recognizing when we're inserting 0s into the upper portions of a vector register and the producing instruction has already produced the zeros.
Phabricator via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 15 10:10:32 PDT 2017
This revision was automatically updated to reflect the committed changes.
Closed by commit rL313365: [X86] Add isel pattern infrastructure to begin recognizing when we're inserting… (authored by ctopper).
Changed prior to commit:
https://reviews.llvm.org/D37653?vs=114461&id=115419#toc
Repository:
rL LLVM
https://reviews.llvm.org/D37653
Files:
llvm/trunk/lib/Target/X86/X86InstrVecCompiler.td
llvm/trunk/test/CodeGen/X86/madd.ll
Index: llvm/trunk/test/CodeGen/X86/madd.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/madd.ll
+++ llvm/trunk/test/CodeGen/X86/madd.ll
@@ -40,7 +40,6 @@
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa %xmm1, %xmm1
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: addq $8, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
@@ -65,7 +64,6 @@
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, %xmm1
; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: addq $8, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
@@ -314,7 +312,6 @@
; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm1
; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm2
; AVX512-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vmovdqa %ymm1, %ymm1
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: addq $16, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
Index: llvm/trunk/lib/Target/X86/X86InstrVecCompiler.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrVecCompiler.td
+++ llvm/trunk/lib/Target/X86/X86InstrVecCompiler.td
@@ -360,3 +360,62 @@
defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32,
loadv4i64, sub_ymm>;
}
+
+// List of opcodes that are guaranteed to zero the upper elements of vector regs.
+// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA
+// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make
+// this difficult. So starting with a couple opcodes used by reduction loops
+// where we explicitly insert zeros.
+class veczeroupper<ValueType vt, RegisterClass RC> :
+ PatLeaf<(vt RC:$src), [{
+ return N->getOpcode() == X86ISD::VPMADDWD;
+ }]>;
+
+def zeroupperv2f64 : veczeroupper<v2f64, VR128>;
+def zeroupperv4f32 : veczeroupper<v4f32, VR128>;
+def zeroupperv2i64 : veczeroupper<v2i64, VR128>;
+def zeroupperv4i32 : veczeroupper<v4i32, VR128>;
+def zeroupperv8i16 : veczeroupper<v8i16, VR128>;
+def zeroupperv16i8 : veczeroupper<v16i8, VR128>;
+
+def zeroupperv4f64 : veczeroupper<v4f64, VR256>;
+def zeroupperv8f32 : veczeroupper<v8f32, VR256>;
+def zeroupperv4i64 : veczeroupper<v4i64, VR256>;
+def zeroupperv8i32 : veczeroupper<v8i32, VR256>;
+def zeroupperv16i16 : veczeroupper<v16i16, VR256>;
+def zeroupperv32i8 : veczeroupper<v32i8, VR256>;
+
+
+// If we can guarantee the upper elements have already been zeroed we can elide
+// an explicit zeroing.
+multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy,
+ ValueType SrcTy, ValueType ZeroTy,
+ SubRegIndex SubIdx, PatLeaf Zeroupper> {
+ def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+ Zeroupper:$src, (iPTR 0))),
+ (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>;
+}
+
+// 128->256
+defm: subvector_zero_ellision<VR128, v4f64, v2f64, v8i32, sub_xmm, zeroupperv2f64>;
+defm: subvector_zero_ellision<VR128, v8f32, v4f32, v8i32, sub_xmm, zeroupperv4f32>;
+defm: subvector_zero_ellision<VR128, v4i64, v2i64, v8i32, sub_xmm, zeroupperv2i64>;
+defm: subvector_zero_ellision<VR128, v8i32, v4i32, v8i32, sub_xmm, zeroupperv4i32>;
+defm: subvector_zero_ellision<VR128, v16i16, v8i16, v8i32, sub_xmm, zeroupperv8i16>;
+defm: subvector_zero_ellision<VR128, v32i8, v16i8, v8i32, sub_xmm, zeroupperv16i8>;
+
+// 128->512
+defm: subvector_zero_ellision<VR128, v8f64, v2f64, v16i32, sub_xmm, zeroupperv2f64>;
+defm: subvector_zero_ellision<VR128, v16f32, v4f32, v16i32, sub_xmm, zeroupperv4f32>;
+defm: subvector_zero_ellision<VR128, v8i64, v2i64, v16i32, sub_xmm, zeroupperv2i64>;
+defm: subvector_zero_ellision<VR128, v16i32, v4i32, v16i32, sub_xmm, zeroupperv4i32>;
+defm: subvector_zero_ellision<VR128, v32i16, v8i16, v16i32, sub_xmm, zeroupperv8i16>;
+defm: subvector_zero_ellision<VR128, v64i8, v16i8, v16i32, sub_xmm, zeroupperv16i8>;
+
+// 256->512
+defm: subvector_zero_ellision<VR256, v8f64, v4f64, v16i32, sub_ymm, zeroupperv4f64>;
+defm: subvector_zero_ellision<VR256, v16f32, v8f32, v16i32, sub_ymm, zeroupperv8f32>;
+defm: subvector_zero_ellision<VR256, v8i64, v4i64, v16i32, sub_ymm, zeroupperv4i64>;
+defm: subvector_zero_ellision<VR256, v16i32, v8i32, v16i32, sub_ymm, zeroupperv8i32>;
+defm: subvector_zero_ellision<VR256, v32i16, v16i16, v16i32, sub_ymm, zeroupperv16i16>;
+defm: subvector_zero_ellision<VR256, v64i8, v32i8, v16i32, sub_ymm, zeroupperv32i8>;
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D37653.115419.patch
Type: text/x-patch
Size: 4780 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20170915/62cb1567/attachment.bin>
More information about the llvm-commits
mailing list