Discovered by JP Maaninen

#include <immintrin.h>

// Slow: per 16-bit lane, returns (a + (a << 1)) + b.
// Both uses of `a` are combined with each other first; `b` is added last.
// NOTE: the closing brace is absent from this listing.
__m256i Slow(__m256i a, __m256i b) {
  // c = a + (a << 1): the shift and the add both take `a` as their input.
  __m256i c = _mm256_add_epi16(a, _mm256_slli_epi16(a, 1)); 
  return _mm256_add_epi16(c, b);

       # Compiler output for Slow (AT&T operand order: sources first, destination last).
       # Assumes %ymm0 = a and %ymm1 = b on entry -- TODO confirm against the ABI in use.
       # %ymm0 = %ymm0 * [.LCPI0_0] per 16-bit lane; the constant's contents are not
       # shown in this listing (presumably a splat of 3 -- verify).
       vpmullw .LCPI0_0(%rip), %ymm0, %ymm0
        # %ymm0 = %ymm0 + %ymm1 per 16-bit lane.
        vpaddw  %ymm1, %ymm0, %ymm0

// Fast: per 16-bit lane, returns (b + (a << 1)) + a.
// The shifted `a` is added to `b` first; the unshifted `a` is added last.
// NOTE: the closing brace is absent from this listing.
__m256i Fast(__m256i a, __m256i b) {
  // c = b + (a << 1): here the shift result is combined with `b`, not with `a`.
  __m256i c = _mm256_add_epi16(b, _mm256_slli_epi16(a, 1)); 
  return _mm256_add_epi16(c, a);

        # Compiler output for Fast (AT&T operand order: sources first, destination last).
        # Assumes %ymm0 = a and %ymm1 = b on entry -- TODO confirm against the ABI in use.
        # %ymm2 = %ymm0 + %ymm0 per 16-bit lane (the doubling stands in for the shift by 1).
        vpaddw  %ymm0, %ymm0, %ymm2
        # %ymm0 = %ymm1 + %ymm0 per 16-bit lane.
        vpaddw  %ymm0, %ymm1, %ymm0
        # %ymm0 = %ymm0 + %ymm2 per 16-bit lane; only register-to-register adds are used.
        vpaddw  %ymm2, %ymm0, %ymm0


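This is either a problem in InstCombine (which folds the shift-and-add into a
multiply) or a missing optimization in the backend (which does not turn the
multiply back into cheaper adds).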

; IR for Slow(a, b): on <16 x i16> lanes, computes (a << 1) + a and then adds b.
; NOTE: the closing brace is absent from this listing, and two lines are
; hard-wrapped (the define line and the shl constant vector).
define dso_local <4 x i64> @_Z4SlowDv4_xS_(<4 x i64> %a, <4 x i64> %b)
local_unnamed_addr #0 {
  ; Reinterpret %a as sixteen 16-bit lanes.
  %0 = bitcast <4 x i64> %a to <16 x i16>
  ; %1 = a << 1 in every lane.
  %1 = shl <16 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ; %add.i5 = (a << 1) + a.
  %add.i5 = add <16 x i16> %1, %0
  ; %2 is not referenced again in this function.
  %2 = bitcast <16 x i16> %add.i5 to <4 x i64>
  ; Reinterpret %b as sixteen 16-bit lanes and add it to %add.i5.
  %3 = bitcast <4 x i64> %b to <16 x i16>
  %add.i = add <16 x i16> %3, %add.i5
  ; Convert the result back to the <4 x i64> return type.
  %4 = bitcast <16 x i16> %add.i to <4 x i64>
  ret <4 x i64> %4

; Attribute group #0, referenced by @_Z4SlowDv4_xS_ above.
; It sets, among others, "target-cpu"="haswell" and "min-legal-vector-width"="256".
attributes #0 = { norecurse nounwind readnone uwtable
"disable-tail-calls"="false" "frame-pointer"="none"
"less-precise-fpmad"="false" "min-legal-vector-width"="256"
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="haswell"
"unsafe-fp-math"="false" "use-soft-float"="false" }

Running `opt -passes=instcombine -S` on this IR generates:

  %add.i5 = mul <16 x i16> %0, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16
3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>

which is selected as VPMULLWYrm in X86ISelDAGToDAG and then survives every
subsequent codegen pass unchanged.

