[PATCH][AVX512] Add mem patterns for 512b FP rounds

Cameron McInally cameron.mcinally at nyu.edu
Tue Feb 17 08:23:19 PST 2015


Hey guys,

Attached is a patch to add patterns for 512b FP rounds that fold a load.

I'm somewhat unhappy with the verbosity of this patch, but couldn't
find a more compact representation. Any suggestions?

Thanks,
Cameron
-------------- next part --------------
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td	(revision 229497)
+++ lib/Target/X86/X86InstrAVX512.td	(working copy)
@@ -4683,25 +4683,45 @@
             (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>;
 }
 
+def : Pat<(v16f32 (ffloor (loadv16f32 addr:$src))),
+          (VRNDSCALEPSZm addr:$src, (i32 0x1))>;
 def : Pat<(v16f32 (ffloor VR512:$src)),
           (VRNDSCALEPSZr VR512:$src, (i32 0x1))>;
+def : Pat<(v16f32 (fnearbyint (loadv16f32 addr:$src))),
+          (VRNDSCALEPSZm addr:$src, (i32 0xC))>;
 def : Pat<(v16f32 (fnearbyint VR512:$src)),
           (VRNDSCALEPSZr VR512:$src, (i32 0xC))>;
+def : Pat<(v16f32 (fceil (loadv16f32 addr:$src))),
+          (VRNDSCALEPSZm addr:$src, (i32 0x2))>;
 def : Pat<(v16f32 (fceil VR512:$src)),
           (VRNDSCALEPSZr VR512:$src, (i32 0x2))>;
+def : Pat<(v16f32 (frint (loadv16f32 addr:$src))),
+          (VRNDSCALEPSZm addr:$src, (i32 0x4))>;
 def : Pat<(v16f32 (frint VR512:$src)),
           (VRNDSCALEPSZr VR512:$src, (i32 0x4))>;
+def : Pat<(v16f32 (ftrunc (loadv16f32 addr:$src))),
+          (VRNDSCALEPSZm addr:$src, (i32 0x3))>;
 def : Pat<(v16f32 (ftrunc VR512:$src)),
           (VRNDSCALEPSZr VR512:$src, (i32 0x3))>;
 
+def : Pat<(v8f64 (ffloor (loadv8f64 addr:$src))),
+          (VRNDSCALEPDZm addr:$src, (i32 0x1))>;
 def : Pat<(v8f64 (ffloor VR512:$src)),
           (VRNDSCALEPDZr VR512:$src, (i32 0x1))>;
+def : Pat<(v8f64 (fnearbyint (loadv8f64 addr:$src))),
+          (VRNDSCALEPDZm addr:$src, (i32 0xC))>;
 def : Pat<(v8f64 (fnearbyint VR512:$src)),
           (VRNDSCALEPDZr VR512:$src, (i32 0xC))>;
+def : Pat<(v8f64 (fceil (loadv8f64 addr:$src))),
+          (VRNDSCALEPDZm addr:$src, (i32 0x2))>;
 def : Pat<(v8f64 (fceil VR512:$src)),
           (VRNDSCALEPDZr VR512:$src, (i32 0x2))>;
+def : Pat<(v8f64 (frint (loadv8f64 addr:$src))),
+          (VRNDSCALEPDZm addr:$src, (i32 0x4))>;
 def : Pat<(v8f64 (frint VR512:$src)),
           (VRNDSCALEPDZr VR512:$src, (i32 0x4))>;
+def : Pat<(v8f64 (ftrunc (loadv8f64 addr:$src))),
+          (VRNDSCALEPDZm addr:$src, (i32 0x3))>;
 def : Pat<(v8f64 (ftrunc VR512:$src)),
           (VRNDSCALEPDZr VR512:$src, (i32 0x3))>;
 
Index: test/CodeGen/X86/avx512-round.ll
===================================================================
--- test/CodeGen/X86/avx512-round.ll	(revision 229497)
+++ test/CodeGen/X86/avx512-round.ll	(working copy)
@@ -6,6 +6,13 @@
   %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %a)
   ret <16 x float> %res
 }
+define <16 x float> @floor_v16f32_mem(<16 x float>* %ptr) {
+; CHECK-LABEL: floor_v16f32_mem
+; CHECK: vrndscaleps $1, ({{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0x07,0x01]
+  %a = load <16 x float>* %ptr
+  %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
 declare <16 x float> @llvm.floor.v16f32(<16 x float> %p)
 
 define <8 x double> @floor_v8f64(<8 x double> %a) {
@@ -14,6 +21,13 @@
   %res = call <8 x double> @llvm.floor.v8f64(<8 x double> %a)
   ret <8 x double> %res
 }
+define <8 x double> @floor_v8f64_mem(<8 x double>* %ptr) {
+; CHECK-LABEL: floor_v8f64_mem
+; CHECK: vrndscalepd $1, ({{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0x07,0x01]
+  %a = load <8 x double>* %ptr
+  %res = call <8 x double> @llvm.floor.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
 declare <8 x double> @llvm.floor.v8f64(<8 x double> %p)
 
 define <16 x float> @ceil_v16f32(<16 x float> %a) {
@@ -22,6 +36,13 @@
   %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %a)
   ret <16 x float> %res
 }
+define <16 x float> @ceil_v16f32_mem(<16 x float>* %ptr) {
+; CHECK-LABEL: ceil_v16f32_mem
+; CHECK: vrndscaleps $2, ({{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0x07,0x02]
+  %a = load <16 x float>* %ptr
+  %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
 declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
 
 define <8 x double> @ceil_v8f64(<8 x double> %a) {
@@ -30,6 +51,13 @@
   %res = call <8 x double> @llvm.ceil.v8f64(<8 x double> %a)
   ret <8 x double> %res
 }
+define <8 x double> @ceil_v8f64_mem(<8 x double>* %ptr) {
+; CHECK-LABEL: ceil_v8f64_mem
+; CHECK: vrndscalepd $2, ({{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0x07,0x02]
+  %a = load <8 x double>* %ptr
+  %res = call <8 x double> @llvm.ceil.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
 declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
 
 define <16 x float> @trunc_v16f32(<16 x float> %a) {
@@ -38,6 +66,13 @@
   %res = call <16 x float> @llvm.trunc.v16f32(<16 x float> %a)
   ret <16 x float> %res
 }
+define <16 x float> @trunc_v16f32_mem(<16 x float>* %ptr) {
+; CHECK-LABEL: trunc_v16f32_mem
+; CHECK: vrndscaleps $3, ({{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0x07,0x03]
+  %a = load <16 x float>* %ptr
+  %res = call <16 x float> @llvm.trunc.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
 declare <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
 
 define <8 x double> @trunc_v8f64(<8 x double> %a) {
@@ -46,6 +81,13 @@
   %res = call <8 x double> @llvm.trunc.v8f64(<8 x double> %a)
   ret <8 x double> %res
 }
+define <8 x double> @trunc_v8f64_mem(<8 x double>* %ptr) {
+; CHECK-LABEL: trunc_v8f64_mem
+; CHECK: vrndscalepd $3, ({{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0x07,0x03]
+  %a = load <8 x double>* %ptr
+  %res = call <8 x double> @llvm.trunc.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
 declare <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
 
 define <16 x float> @rint_v16f32(<16 x float> %a) {
@@ -54,6 +96,13 @@
   %res = call <16 x float> @llvm.rint.v16f32(<16 x float> %a)
   ret <16 x float> %res
 }
+define <16 x float> @rint_v16f32_mem(<16 x float>* %ptr) {
+; CHECK-LABEL: rint_v16f32_mem
+; CHECK: vrndscaleps $4, ({{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0x07,0x04]
+  %a = load <16 x float>* %ptr
+  %res = call <16 x float> @llvm.rint.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
 declare <16 x float> @llvm.rint.v16f32(<16 x float> %p)
 
 define <8 x double> @rint_v8f64(<8 x double> %a) {
@@ -62,6 +111,13 @@
   %res = call <8 x double> @llvm.rint.v8f64(<8 x double> %a)
   ret <8 x double> %res
 }
+define <8 x double> @rint_v8f64_mem(<8 x double>* %ptr) {
+; CHECK-LABEL: rint_v8f64_mem
+; CHECK: vrndscalepd $4, ({{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0x07,0x04]
+  %a = load <8 x double>* %ptr
+  %res = call <8 x double> @llvm.rint.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
 declare <8 x double> @llvm.rint.v8f64(<8 x double> %p)
 
 define <16 x float> @nearbyint_v16f32(<16 x float> %a) {
@@ -70,6 +126,13 @@
   %res = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %a)
   ret <16 x float> %res
 }
+define <16 x float> @nearbyint_v16f32_mem(<16 x float>* %ptr) {
+; CHECK-LABEL: nearbyint_v16f32_mem
+; CHECK: vrndscaleps $12, ({{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0x07,0x0c]
+  %a = load <16 x float>* %ptr
+  %res = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
 declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
 
 define <8 x double> @nearbyint_v8f64(<8 x double> %a) {
@@ -78,4 +141,11 @@
   %res = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %a)
   ret <8 x double> %res
 }
+define <8 x double> @nearbyint_v8f64_mem(<8 x double>* %ptr) {
+; CHECK-LABEL: nearbyint_v8f64_mem
+; CHECK: vrndscalepd $12, ({{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0x07,0x0c]
+  %a = load <8 x double>* %ptr
+  %res = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
 declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)


More information about the llvm-commits mailing list