[PATCH] D150143: [X86] Add X86FixupVectorConstantsPass to fold vectors constant loads as broadcasts (WIP)

Simon Pilgrim via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Mon May 8 14:05:16 PDT 2023


RKSimon created this revision.
RKSimon added reviewers: pengfei, goldstein.w.n, craig.topper, andreadb.
Herald added a subscriber: hiraditya.
Herald added a project: All.
RKSimon requested review of this revision.
Herald added a project: LLVM.

This is a WIP patch to remove the broadcasting of constants from the DAG and to instead perform this in a later pass. I'd like to hear people's thoughts on the approach while it's still in the early stages.

The principal aim is to prevent the premature creation of broadcasts that prevent us folding the loads with another instruction, helping to reduce register pressure.

There's still a lot to be addressed in this early patch including:

- Subvector Broadcast handling (VBROADCASTF128 etc.).
- Folding of AVX512 constant loads (including masked loads) to AVX512 broadcasts.
- Folding of AVX512 instructions with folded constant loads to folded broadcasts.
- Better use of AVX (fp broadcasts) and SSE3 (movddup) broadcast instructions - the comment printouts are a mess of float / integer which we might want to address first?
- Remove the constant support entirely from lowerBuildVectorAsBroadcast in the DAG.

Later work includes:

- Use of VPMOVZX/VPMOVSX extension loads for non-uniform constants that are representable with smaller integers
- Investigate possible rematerializable constants (shifted masks have been mentioned) to avoid loads entirely


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D150143

Files:
  llvm/lib/Target/X86/CMakeLists.txt
  llvm/lib/Target/X86/X86.h
  llvm/lib/Target/X86/X86FixupVectorConstants.cpp
  llvm/lib/Target/X86/X86ISelLowering.cpp
  llvm/lib/Target/X86/X86TargetMachine.cpp
  llvm/test/CodeGen/X86/abdu-vector-128.ll
  llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
  llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
  llvm/test/CodeGen/X86/avg.ll
  llvm/test/CodeGen/X86/avx-basic.ll
  llvm/test/CodeGen/X86/avx-logic.ll
  llvm/test/CodeGen/X86/avx-vbroadcast.ll
  llvm/test/CodeGen/X86/avx-vperm2x128.ll
  llvm/test/CodeGen/X86/avx2-arith.ll
  llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
  llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
  llvm/test/CodeGen/X86/avx2-shift.ll
  llvm/test/CodeGen/X86/avx2-vbroadcast.ll
  llvm/test/CodeGen/X86/avx2-vector-shifts.ll
  llvm/test/CodeGen/X86/avx512-arith.ll
  llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
  llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
  llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
  llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
  llvm/test/CodeGen/X86/bitcast-vector-bool.ll
  llvm/test/CodeGen/X86/bool-ext-inc.ll
  llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
  llvm/test/CodeGen/X86/cast-vsel.ll
  llvm/test/CodeGen/X86/combine-add.ll
  llvm/test/CodeGen/X86/combine-addo.ll
  llvm/test/CodeGen/X86/combine-and.ll
  llvm/test/CodeGen/X86/combine-bitselect.ll
  llvm/test/CodeGen/X86/combine-concatvectors.ll
  llvm/test/CodeGen/X86/combine-fabs.ll
  llvm/test/CodeGen/X86/combine-fcopysign.ll
  llvm/test/CodeGen/X86/combine-mul.ll
  llvm/test/CodeGen/X86/combine-pavg.ll
  llvm/test/CodeGen/X86/combine-pmuldq.ll
  llvm/test/CodeGen/X86/combine-rotates.ll
  llvm/test/CodeGen/X86/combine-sdiv.ll
  llvm/test/CodeGen/X86/combine-shl.ll
  llvm/test/CodeGen/X86/combine-smax.ll
  llvm/test/CodeGen/X86/combine-smin.ll
  llvm/test/CodeGen/X86/combine-srem.ll
  llvm/test/CodeGen/X86/combine-srl.ll
  llvm/test/CodeGen/X86/combine-sub-usat.ll
  llvm/test/CodeGen/X86/combine-udiv.ll
  llvm/test/CodeGen/X86/combine-urem.ll
  llvm/test/CodeGen/X86/concat-cast.ll
  llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
  llvm/test/CodeGen/X86/exedepsfix-broadcast.ll
  llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll
  llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
  llvm/test/CodeGen/X86/extractelement-fp.ll
  llvm/test/CodeGen/X86/extractelement-from-arg.ll
  llvm/test/CodeGen/X86/extractelement-legalization-cycle.ll
  llvm/test/CodeGen/X86/extractelement-load.ll
  llvm/test/CodeGen/X86/extractelement-shuffle.ll
  llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
  llvm/test/CodeGen/X86/fma_patterns.ll
  llvm/test/CodeGen/X86/fma_patterns_wide.ll
  llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
  llvm/test/CodeGen/X86/fp-round.ll
  llvm/test/CodeGen/X86/freeze-binary.ll
  llvm/test/CodeGen/X86/freeze-vector.ll
  llvm/test/CodeGen/X86/funnel-shift-rot.ll
  llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
  llvm/test/CodeGen/X86/gfni-rotates.ll
  llvm/test/CodeGen/X86/gfni-shifts.ll
  llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
  llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
  llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
  llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
  llvm/test/CodeGen/X86/i64-to-float.ll
  llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
  llvm/test/CodeGen/X86/icmp-pow2-diff.ll
  llvm/test/CodeGen/X86/insert-into-constant-vector.ll
  llvm/test/CodeGen/X86/known-bits-vector.ll
  llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
  llvm/test/CodeGen/X86/masked_load.ll
  llvm/test/CodeGen/X86/masked_store_trunc.ll
  llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
  llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
  llvm/test/CodeGen/X86/memset-nonzero.ll
  llvm/test/CodeGen/X86/merge-store-constants.ll
  llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
  llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
  llvm/test/CodeGen/X86/movmsk-cmp.ll
  llvm/test/CodeGen/X86/oddshuffles.ll
  llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
  llvm/test/CodeGen/X86/opt-pipeline.ll
  llvm/test/CodeGen/X86/packss.ll
  llvm/test/CodeGen/X86/paddus.ll
  llvm/test/CodeGen/X86/pmul.ll
  llvm/test/CodeGen/X86/pmulh.ll
  llvm/test/CodeGen/X86/pr30290.ll
  llvm/test/CodeGen/X86/pr32368.ll
  llvm/test/CodeGen/X86/pr38639.ll
  llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll
  llvm/test/CodeGen/X86/psubus.ll
  llvm/test/CodeGen/X86/recip-fastmath.ll
  llvm/test/CodeGen/X86/recip-fastmath2.ll
  llvm/test/CodeGen/X86/sadd_sat_vec.ll
  llvm/test/CodeGen/X86/sar_fold64.ll
  llvm/test/CodeGen/X86/sat-add.ll
  llvm/test/CodeGen/X86/sdiv-exact.ll
  llvm/test/CodeGen/X86/select-of-fp-constants.ll
  llvm/test/CodeGen/X86/setcc-non-simple-type.ll
  llvm/test/CodeGen/X86/shrink_vmul.ll
  llvm/test/CodeGen/X86/shuffle-blendw.ll
  llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
  llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
  llvm/test/CodeGen/X86/slow-pmulld.ll
  llvm/test/CodeGen/X86/splat-const.ll
  llvm/test/CodeGen/X86/splat-for-size.ll
  llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
  llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
  llvm/test/CodeGen/X86/sqrt-fastmath.ll
  llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
  llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
  llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
  llvm/test/CodeGen/X86/sse2.ll
  llvm/test/CodeGen/X86/sshl_sat_vec.ll
  llvm/test/CodeGen/X86/ssub_sat_vec.ll
  llvm/test/CodeGen/X86/subvector-broadcast.ll
  llvm/test/CodeGen/X86/uadd_sat_vec.ll
  llvm/test/CodeGen/X86/umax.ll
  llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
  llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
  llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
  llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
  llvm/test/CodeGen/X86/urem-seteq.ll
  llvm/test/CodeGen/X86/urem-vector-lkk.ll
  llvm/test/CodeGen/X86/usub_sat_vec.ll
  llvm/test/CodeGen/X86/v8i1-masks.ll
  llvm/test/CodeGen/X86/var-permute-128.ll
  llvm/test/CodeGen/X86/var-permute-256.ll
  llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
  llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
  llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
  llvm/test/CodeGen/X86/vec_cmp_uint-128.ll
  llvm/test/CodeGen/X86/vec_fabs.ll
  llvm/test/CodeGen/X86/vec_fp_to_int.ll
  llvm/test/CodeGen/X86/vec_int_to_fp.ll
  llvm/test/CodeGen/X86/vec_minmax_uint.ll
  llvm/test/CodeGen/X86/vec_shift6.ll
  llvm/test/CodeGen/X86/vec_smulo.ll
  llvm/test/CodeGen/X86/vec_uaddo.ll
  llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
  llvm/test/CodeGen/X86/vec_uint_to_fp.ll
  llvm/test/CodeGen/X86/vec_umulo.ll
  llvm/test/CodeGen/X86/vec_usubo.ll
  llvm/test/CodeGen/X86/vector-bitreverse.ll
  llvm/test/CodeGen/X86/vector-blend.ll
  llvm/test/CodeGen/X86/vector-bo-select.ll
  llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
  llvm/test/CodeGen/X86/vector-fshl-128.ll
  llvm/test/CodeGen/X86/vector-fshl-256.ll
  llvm/test/CodeGen/X86/vector-fshl-512.ll
  llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
  llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
  llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
  llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
  llvm/test/CodeGen/X86/vector-fshr-128.ll
  llvm/test/CodeGen/X86/vector-fshr-256.ll
  llvm/test/CodeGen/X86/vector-fshr-512.ll
  llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
  llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
  llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
  llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
  llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
  llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
  llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
  llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
  llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
  llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
  llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
  llvm/test/CodeGen/X86/vector-lzcnt-512.ll
  llvm/test/CodeGen/X86/vector-mul.ll
  llvm/test/CodeGen/X86/vector-pack-128.ll
  llvm/test/CodeGen/X86/vector-pack-256.ll
  llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
  llvm/test/CodeGen/X86/vector-popcnt-128.ll
  llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
  llvm/test/CodeGen/X86/vector-popcnt-256.ll
  llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
  llvm/test/CodeGen/X86/vector-popcnt-512.ll
  llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
  llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
  llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
  llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
  llvm/test/CodeGen/X86/vector-reduce-umax.ll
  llvm/test/CodeGen/X86/vector-reduce-umin.ll
  llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
  llvm/test/CodeGen/X86/vector-rotate-128.ll
  llvm/test/CodeGen/X86/vector-rotate-256.ll
  llvm/test/CodeGen/X86/vector-rotate-512.ll
  llvm/test/CodeGen/X86/vector-sext.ll
  llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
  llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
  llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
  llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
  llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
  llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
  llvm/test/CodeGen/X86/vector-shift-shl-256.ll
  llvm/test/CodeGen/X86/vector-shift-shl-512.ll
  llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
  llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
  llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
  llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
  llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
  llvm/test/CodeGen/X86/vector-shuffle-combining.ll
  llvm/test/CodeGen/X86/vector-trunc-math.ll
  llvm/test/CodeGen/X86/vector-trunc-packus.ll
  llvm/test/CodeGen/X86/vector-trunc-ssat.ll
  llvm/test/CodeGen/X86/vector-trunc-usat.ll
  llvm/test/CodeGen/X86/vector-trunc.ll
  llvm/test/CodeGen/X86/vector-tzcnt-128.ll
  llvm/test/CodeGen/X86/vector-tzcnt-256.ll
  llvm/test/CodeGen/X86/vector-tzcnt-512.ll
  llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
  llvm/test/CodeGen/X86/vector-zext.ll
  llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
  llvm/test/CodeGen/X86/vselect-avx.ll
  llvm/test/CodeGen/X86/vselect-minmax.ll
  llvm/test/CodeGen/X86/vselect-pcmp.ll
  llvm/test/CodeGen/X86/vselect-post-combine.ll
  llvm/test/CodeGen/X86/vselect-zero.ll
  (5 more files...)



More information about the llvm-commits mailing list