[llvm] [ValueTracking][X86] Compute KnownBits for phadd/phsub (PR #92429)

via llvm-commits llvm-commits at lists.llvm.org
Sat May 18 23:53:17 PDT 2024


https://github.com/mskamp updated https://github.com/llvm/llvm-project/pull/92429

From 3caccd86b75ed89cb53001b5d12d2f9750ebd267 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Wed, 15 May 2024 18:05:06 +0200
Subject: [PATCH 1/2] [ValueTracking][X86][NFC] Add Tests for KnownBits of
 phadd/phsub

---
 .../ValueTracking/knownbits-hadd-hsub.ll      | 324 ++++++++++++++++++
 llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll  | 296 ++++++++++++++++
 2 files changed, 620 insertions(+)
 create mode 100644 llvm/test/Analysis/ValueTracking/knownbits-hadd-hsub.ll
 create mode 100644 llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll

diff --git a/llvm/test/Analysis/ValueTracking/knownbits-hadd-hsub.ll b/llvm/test/Analysis/ValueTracking/knownbits-hadd-hsub.ll
new file mode 100644
index 0000000000000..663e9ed01888d
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/knownbits-hadd-hsub.ll
@@ -0,0 +1,324 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define <4 x i1> @hadd_and_eq_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_and_eq_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[TMP2]], <i32 -8, i32 -8, i32 -8, i32 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <4 x i32> [[TMP3]], <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %andr = and <4 x i32> %hadd, <i32 -8, i32 -8, i32 -8, i32 -8>
+  %ret = icmp eq <4 x i32> %andr, <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i1> %ret
+}
+
+define <8 x i1> @hadd_and_eq_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16(
+; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %and1, <8 x i16> %and2)
+  %andr = and <8 x i16> %hadd, <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+  %ret = icmp eq <8 x i16> %andr, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+  ret <8 x i1> %ret
+}
+
+define <8 x i1> @hadd_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16_sat(
+; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %and1, <8 x i16> %and2)
+  %andr = and <8 x i16> %hadd, <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+  %ret = icmp eq <8 x i16> %andr, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+  ret <8 x i1> %ret
+}
+
+define <8 x i1> @hadd_and_eq_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i32(
+; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP2]], <i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i32> [[TMP3]], <i32 3, i32 4, i32 5, i32 6, i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
+  %andr = and <8 x i32> %hadd, <i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8>
+  %ret = icmp eq <8 x i32> %andr, <i32 3, i32 4, i32 5, i32 6, i32 3, i32 4, i32 5, i32 6>
+  ret <8 x i1> %ret
+}
+
+define <16 x i1> @hadd_and_eq_v16i16(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16(
+; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+; CHECK-NEXT:    ret <16 x i1> [[RET]]
+;
+entry:
+  %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %and1, <16 x i16> %and2)
+  %andr = and <16 x i16> %hadd, <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+  %ret = icmp eq <16 x i16> %andr, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+  ret <16 x i1> %ret
+}
+
+define <16 x i1> @hadd_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16_sat(
+; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+; CHECK-NEXT:    ret <16 x i1> [[RET]]
+;
+entry:
+  %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %and1, <16 x i16> %and2)
+  %andr = and <16 x i16> %hadd, <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+  %ret = icmp eq <16 x i16> %andr, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+  ret <16 x i1> %ret
+}
+
+define <4 x i1> @hsub_trunc_eq_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hsub_trunc_eq_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = or <4 x i32> [[X]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[Y]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[CONV:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <4 x i16> [[CONV]], <i16 3, i16 4, i16 5, i16 6>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %or2 = or <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %hsub = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %or1, <4 x i32> %or2)
+  %conv = trunc <4 x i32> %hsub to <4 x i16>
+  %ret = icmp eq <4 x i16> %conv, <i16 3, i16 4, i16 5, i16 6>
+  ret <4 x i1> %ret
+}
+
+define <8 x i1> @hsub_trunc_eq_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i16(
+; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i16> [[X]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i16> [[Y]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[CONV:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i8> [[CONV]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %or2 = or <8 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %or1, <8 x i16> %or2)
+  %conv = trunc <8 x i16> %hsub to <8 x i8>
+  %ret = icmp eq <8 x i8> %conv, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
+  ret <8 x i1> %ret
+}
+
+define <8 x i1> @hsub_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <8 x i1> @hsub_and_eq_v8i16_sat(
+; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR1:%.*]] = or <8 x i16> [[X]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+; CHECK-NEXT:    [[OR2:%.*]] = or <8 x i16> [[Y]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+; CHECK-NEXT:    [[AND1:%.*]] = and <8 x i16> [[OR1]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+; CHECK-NEXT:    [[AND2:%.*]] = and <8 x i16> [[OR2]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+; CHECK-NEXT:    [[HSUB:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> [[AND1]], <8 x i16> [[AND2]])
+; CHECK-NEXT:    [[RET:%.*]] = icmp slt <8 x i16> [[HSUB]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %or1 = or <8 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %or2 = or <8 x i16> %y, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %and1 = and <8 x i16> %or1, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %and2 = and <8 x i16> %or2, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %and1, <8 x i16> %and2)
+  %ret = icmp sle <8 x i16> %hsub, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i1> %ret
+}
+
+define <8 x i1> @hsub_trunc_eq_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i32(
+; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i32> [[X]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i32> [[Y]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[CONV:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[CONV]], <i16 3, i16 4, i16 5, i16 6, i16 3, i16 4, i16 5, i16 6>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %or2 = or <8 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %hsub = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %or1, <8 x i32> %or2)
+  %conv = trunc <8 x i32> %hsub to <8 x i16>
+  %ret = icmp eq <8 x i16> %conv, <i16 3, i16 4, i16 5, i16 6, i16 3, i16 4, i16 5, i16 6>
+  ret <8 x i1> %ret
+}
+
+define <16 x i1> @hsub_trunc_eq_v16i16(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: define <16 x i1> @hsub_trunc_eq_v16i16(
+; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = or <16 x i16> [[X]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP1:%.*]] = or <16 x i16> [[Y]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
+; CHECK-NEXT:    [[CONV:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i8> [[CONV]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
+; CHECK-NEXT:    ret <16 x i1> [[RET]]
+;
+entry:
+  %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %or2 = or <16 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %or1, <16 x i16> %or2)
+  %conv = trunc <16 x i16> %hsub to <16 x i8>
+  %ret = icmp eq <16 x i8> %conv, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
+  ret <16 x i1> %ret
+}
+
+define <16 x i1> @hsub_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: define <16 x i1> @hsub_and_eq_v16i16_sat(
+; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR1:%.*]] = or <16 x i16> [[X]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+; CHECK-NEXT:    [[OR2:%.*]] = or <16 x i16> [[Y]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+; CHECK-NEXT:    [[AND1:%.*]] = and <16 x i16> [[OR1]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+; CHECK-NEXT:    [[AND2:%.*]] = and <16 x i16> [[OR2]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+; CHECK-NEXT:    [[HSUB:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> [[AND1]], <16 x i16> [[AND2]])
+; CHECK-NEXT:    [[RET:%.*]] = icmp slt <16 x i16> [[HSUB]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+; CHECK-NEXT:    ret <16 x i1> [[RET]]
+;
+entry:
+  %or1 = or <16 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %or2 = or <16 x i16> %y, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %and1 = and <16 x i16> %or1, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %and2 = and <16 x i16> %or2, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %and1, <16 x i16> %and2)
+  %ret = icmp sle <16 x i16> %hsub, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <16 x i1> %ret
+}
+
+define <4 x i1> @hadd_shuffle_2st_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_shuffle_2st_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 -1, i32 -1, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[Y]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %shuf = shufflevector <4 x i32> %hadd, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
+  %ret = icmp ne <4 x i32> %shuf, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @hadd_shuffle_4th_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_shuffle_4th_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[Y]], <i32 -1, i32 -1, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[X]], <4 x i32> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %y, <i32 -1, i32 -1, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %shuf = shufflevector <4 x i32> %hadd, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  %ret = icmp ne <4 x i32> %shuf, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @hadd_shuffle_2st_negative_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_shuffle_2st_negative_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 3, i32 3, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP3]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %shuf = shufflevector <4 x i32> %hadd, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
+  %ret = icmp ne <4 x i32> %shuf, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @hadd_shuffle_4th_negative_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_shuffle_4th_negative_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[Y]], <i32 3, i32 3, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP3]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 -1, i32 -1>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %shuf = shufflevector <4 x i32> %hadd, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  %ret = icmp ne <4 x i32> %shuf, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i1> %ret
+}
diff --git a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
new file mode 100644
index 0000000000000..ac6895fa5a8b9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
@@ -0,0 +1,296 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s
+
+define <4 x i16> @hadd_trunc_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: hadd_trunc_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %conv = trunc <4 x i32> %hadd to <4 x i16>
+  ret <4 x i16> %conv
+}
+
+define <8 x i8> @hadd_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: hadd_trunc_v8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3]
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %and1, <8 x i16> %and2)
+  %conv = trunc <8 x i16> %hadd to <8 x i8>
+  ret <8 x i8> %conv
+}
+
+define <8 x i8> @hadd_trunc_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: hadd_trunc_v8i16_sat:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3]
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %and1, <8 x i16> %and2)
+  %conv = trunc <8 x i16> %hadd to <8 x i8>
+  ret <8 x i8> %conv
+}
+
+define <8 x i16> @hadd_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: hadd_trunc_v8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
+  %conv = trunc <8 x i32> %hadd to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <16 x i8> @hadd_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: hadd_trunc_v16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %and1, <16 x i16> %and2)
+  %conv = trunc <16 x i16> %hadd to <16 x i8>
+  ret <16 x i8> %conv
+}
+
+define <16 x i8> @hadd_trunc_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: hadd_trunc_v16i16_sat:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %and1, <16 x i16> %and2)
+  %conv = trunc <16 x i16> %hadd to <16 x i8>
+  ret <16 x i8> %conv
+}
+
+define <4 x i16> @hsub_trunc_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: hsub_trunc_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    retq
+entry:
+  %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %or2 = or <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %hsub = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %or1, <4 x i32> %or2)
+  %conv = trunc <4 x i32> %hsub to <4 x i16>
+  ret <4 x i16> %conv
+}
+
+define <8 x i8> @hsub_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: hsub_trunc_v8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    retq
+entry:
+  %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %or2 = or <8 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %or1, <8 x i16> %or2)
+  %conv = trunc <8 x i16> %hsub to <8 x i8>
+  ret <8 x i8> %conv
+}
+
+define <8 x i8> @hsub_trunc_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: hsub_trunc_v8i16_sat:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,0,3,0,3,0,3,0]
+; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [7,3,7,3,7,3,7,3]
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    retq
+entry:
+  %or1 = or <8 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %or2 = or <8 x i16> %y, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %and1 = and <8 x i16> %or1, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %and2 = and <8 x i16> %or2, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %and1, <8 x i16> %and2)
+  %conv = trunc <8 x i16> %hsub to <8 x i8>
+  ret <8 x i8> %conv
+}
+
+define <8 x i16> @hsub_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: hsub_trunc_v8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; CHECK-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %or2 = or <8 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %hsub = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %or1, <8 x i32> %or2)
+  %conv = trunc <8 x i32> %hsub to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <16 x i8> @hsub_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: hsub_trunc_v16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %or2 = or <16 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %or1, <16 x i16> %or2)
+  %conv = trunc <16 x i16> %hsub to <16 x i8>
+  ret <16 x i8> %conv
+}
+
+define <16 x i8> @hsub_trunc_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: hsub_trunc_v16i16_sat:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0]
+; CHECK-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [7,3,7,3,7,3,7,3,7,3,7,3,7,3,7,3]
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %or1 = or <16 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %or2 = or <16 x i16> %y, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %and1 = and <16 x i16> %or1, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %and2 = and <16 x i16> %or2, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %and1, <16 x i16> %and2)
+  %conv = trunc <16 x i16> %hsub to <16 x i8>
+  ret <16 x i8> %conv
+}
+
+define <4 x i16> @hadd_extract_2st_trunc_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: hadd_extract_2st_trunc_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %andr = and <4 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0>
+  %conv = trunc <4 x i32> %andr to <4 x i16>
+  ret <4 x i16> %conv
+}
+
+define <4 x i16> @hadd_extract_4th_trunc_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: hadd_extract_4th_trunc_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[12,13],zero,zero,xmm0[12,13,12,13,14,15]
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %y, <i32 -1, i32 -1, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %andr = and <4 x i32> %hadd, <i32 0, i32 0, i32 0, i32 -1>
+  %conv = trunc <4 x i32> %andr to <4 x i16>
+  ret <4 x i16> %conv
+}
+
+define <4 x i16> @hadd_extract_2st_trunc_redundant_and_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: hadd_extract_2st_trunc_redundant_and_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %andr = and <4 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0>
+  %conv = trunc <4 x i32> %andr to <4 x i16>
+  ret <4 x i16> %conv
+}
+
+define <4 x i16> @hadd_extract_4th_trunc_redundant_and_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: hadd_extract_4th_trunc_redundant_and_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[12,13],zero,zero,xmm0[12,13,12,13,14,15]
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 -1, i32 -1>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %andr = and <4 x i32> %hadd, <i32 0, i32 0, i32 0, i32 -1>
+  %conv = trunc <4 x i32> %andr to <4 x i16>
+  ret <4 x i16> %conv
+}

From 58d73b1024ae3390b368b97a5ec34a36126399bd Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Thu, 16 May 2024 17:04:55 +0200
Subject: [PATCH 2/2] [ValueTracking][X86] Compute KnownBits for phadd/phsub

Add KnownBits computations to ValueTracking and X86 DAG lowering.

These instructions add/subtract adjacent vector elements in their
operands. Example: phadd [X1, X2] [Y1, Y2] = [X1 + X2, Y1 + Y2].
This means that, in this example, we can compute the KnownBits of the
operation by computing the KnownBits of [X1, X2] + [X1, X2] and
[Y1, Y2] + [Y1, Y2] and intersecting the results. This approach
also generalizes to all x86 vector types.
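
A minimal sketch of that computation, assuming only the KnownBits helpers the
patch itself calls (computeForAddSub and intersectWith); the 16-bit width and
the [0, 3] element bound are illustrative values, not taken from the patch:

  // Both operands of a 16-bit horizontal add are known to be 0b000000??,
  // i.e. in [0, 3]. Each pairwise sum then fits in 3 bits, so intersecting
  // the two per-operand results keeps bits [3, 16) known zero in every
  // result element.
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  KnownBits haddKnownBitsSketch() {
    KnownBits Elt(16);
    Elt.Zero.setBitsFrom(2); // every source element is in [0, 3]
    KnownBits SumLHS = KnownBits::computeForAddSub(
        /*Add=*/true, /*NSW=*/false, /*NUW=*/false, Elt, Elt);
    KnownBits SumRHS = SumLHS; // the second operand has the same bound here
    return SumLHS.intersectWith(SumRHS); // bits [3, 16) are known zero
  }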

There are also the operations phadd.sw and phsub.sw, which perform
saturating addition/subtraction. Use sadd_sat and ssub_sat to compute
the KnownBits of these operations.
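
The saturating case follows the same sketch with KnownBits::sadd_sat (or
ssub_sat) substituted for the wrapping add; the bound is again illustrative:

  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // With the same [0, 3] element bound the signed saturating sum cannot
  // saturate, so sadd_sat reports the same known-zero upper bits as the
  // wrapping add above.
  KnownBits haddSatKnownBitsSketch() {
    KnownBits Elt(16);
    Elt.Zero.setBitsFrom(2);
    KnownBits Sum = KnownBits::sadd_sat(Elt, Elt);
    return Sum.intersectWith(Sum); // per-operand results intersected as before
  }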

Also adjust the existing test case pr53247.ll, whose result can now be
folded to a constant by the new KnownBits computation.

Fixes #82516.
---
 llvm/include/llvm/Analysis/VectorUtils.h      | 14 +++
 llvm/lib/Analysis/ValueTracking.cpp           | 85 ++++++++++++++++
 llvm/lib/Analysis/VectorUtils.cpp             | 20 ++++
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 95 ++++++++++++++++++
 .../ValueTracking/knownbits-hadd-hsub.ll      | 98 +++----------------
 llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll  | 54 +++-------
 llvm/test/CodeGen/X86/pr53247.ll              | 10 +-
 llvm/unittests/Analysis/VectorUtilsTest.cpp   | 20 ++++
 8 files changed, 266 insertions(+), 130 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 521dac08792f5..9c6da6c874019 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -246,6 +246,20 @@ void processShuffleMasks(
     function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
     function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);
 
+/// Compute the demanded elements mask of horizontal binary operations. A
+/// horizontal operation combines two adjacent elements in a vector operand.
+/// This function returns a mask for the elements that correspond to the first
+/// operand of this horizontal combination. For example, for two vectors
+/// [X1, X2, X3, X4] and [Y1, Y2, Y3, Y4], the resulting mask can include the
+/// elements X1, X3, Y1, and Y3. To get the other operands, simply shift the
+/// result of this function to the left by 1.
+///
+/// \param DemandedEltsOp  the demanded elements mask for the operation
+/// \param DemandedEltsLHS the demanded elements mask for the left operand
+/// \param DemandedEltsRHS the demanded elements mask for the right operand
+void getHorizontalDemandedElts(const APInt &DemandedEltsOp,
+                               APInt &DemandedEltsLHS, APInt &DemandedEltsRHS);
+
 /// Compute a map of integer instructions to their minimum legal type
 /// size.
 ///
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 2fdbb6e3ef840..35058e30d8849 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -950,6 +950,41 @@ getKnownBitsFromAndXorOr(const Operator *I, const APInt &DemandedElts,
   return KnownOut;
 }
 
+static KnownBits computeKnownBitsForHorizontalOperation(
+    const Operator *I, const APInt &DemandedElts, unsigned Depth,
+    const SimplifyQuery &Q,
+    const std::function<KnownBits(const KnownBits &, const KnownBits &)>
+        KnownBitsFunc) {
+  APInt DemandedEltsLHS, DemandedEltsRHS;
+  getHorizontalDemandedElts(DemandedElts, DemandedEltsLHS, DemandedEltsRHS);
+
+  std::array<KnownBits, 2> KnownLHS;
+  for (unsigned Index = 0; Index < KnownLHS.size(); ++Index) {
+    if (!DemandedEltsLHS.isZero()) {
+      KnownLHS[Index] =
+          computeKnownBits(I->getOperand(0), DemandedEltsLHS, Depth + 1, Q);
+    } else {
+      KnownLHS[Index] = KnownBits(I->getType()->getScalarSizeInBits());
+      KnownLHS[Index].setAllZero();
+    }
+    DemandedEltsLHS <<= 1;
+  }
+  std::array<KnownBits, 2> KnownRHS;
+  for (unsigned Index = 0; Index < KnownRHS.size(); ++Index) {
+    if (!DemandedEltsRHS.isZero()) {
+      KnownRHS[Index] =
+          computeKnownBits(I->getOperand(1), DemandedEltsRHS, Depth + 1, Q);
+    } else {
+      KnownRHS[Index] = KnownBits(I->getType()->getScalarSizeInBits());
+      KnownRHS[Index].setAllZero();
+    }
+    DemandedEltsRHS <<= 1;
+  }
+
+  return KnownBitsFunc(KnownLHS[0], KnownLHS[1])
+      .intersectWith(KnownBitsFunc(KnownRHS[0], KnownRHS[1]));
+}
+
 // Public so this can be used in `SimplifyDemandedUseBits`.
 KnownBits llvm::analyzeKnownBitsFromAndXorOr(const Operator *I,
                                              const KnownBits &KnownLHS,
@@ -1725,6 +1760,56 @@ static void computeKnownBitsFromOperator(const Operator *I,
       case Intrinsic::x86_sse42_crc32_64_64:
         Known.Zero.setBitsFrom(32);
         break;
+      case Intrinsic::x86_ssse3_phadd_d:
+      case Intrinsic::x86_ssse3_phadd_w:
+      case Intrinsic::x86_ssse3_phadd_d_128:
+      case Intrinsic::x86_ssse3_phadd_w_128:
+      case Intrinsic::x86_avx2_phadd_d:
+      case Intrinsic::x86_avx2_phadd_w: {
+        Known = computeKnownBitsForHorizontalOperation(
+            I, DemandedElts, Depth, Q,
+            [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+              return KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
+                                                 /*NUW=*/false, KnownLHS,
+                                                 KnownRHS);
+            });
+        break;
+      }
+      case Intrinsic::x86_ssse3_phadd_sw:
+      case Intrinsic::x86_ssse3_phadd_sw_128:
+      case Intrinsic::x86_avx2_phadd_sw: {
+        Known = computeKnownBitsForHorizontalOperation(
+            I, DemandedElts, Depth, Q,
+            [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+              return KnownBits::sadd_sat(KnownLHS, KnownRHS);
+            });
+        break;
+      }
+      case Intrinsic::x86_ssse3_phsub_d:
+      case Intrinsic::x86_ssse3_phsub_w:
+      case Intrinsic::x86_ssse3_phsub_d_128:
+      case Intrinsic::x86_ssse3_phsub_w_128:
+      case Intrinsic::x86_avx2_phsub_d:
+      case Intrinsic::x86_avx2_phsub_w: {
+        Known = computeKnownBitsForHorizontalOperation(
+            I, DemandedElts, Depth, Q,
+            [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+              return KnownBits::computeForAddSub(/*Add=*/false, /*NSW=*/false,
+                                                 /*NUW=*/false, KnownLHS,
+                                                 KnownRHS);
+            });
+        break;
+      }
+      case Intrinsic::x86_ssse3_phsub_sw:
+      case Intrinsic::x86_ssse3_phsub_sw_128:
+      case Intrinsic::x86_avx2_phsub_sw: {
+        Known = computeKnownBitsForHorizontalOperation(
+            I, DemandedElts, Depth, Q,
+            [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+              return KnownBits::ssub_sat(KnownLHS, KnownRHS);
+            });
+        break;
+      }
       case Intrinsic::riscv_vsetvli:
       case Intrinsic::riscv_vsetvlimax: {
         bool HasAVL = II->getIntrinsicID() == Intrinsic::riscv_vsetvli;
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 917094267d05a..2b94fdea30923 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -541,6 +541,26 @@ void llvm::processShuffleMasks(
   }
 }
 
+void llvm::getHorizontalDemandedElts(const APInt &DemandedEltsOp,
+                                     APInt &DemandedEltsLHS,
+                                     APInt &DemandedEltsRHS) {
+  DemandedEltsLHS = DemandedEltsRHS =
+      APInt::getZero(DemandedEltsOp.getBitWidth());
+
+  unsigned Index = 0;
+  const auto HalfBitWidth = DemandedEltsOp.getBitWidth() / 2;
+  for (; Index < HalfBitWidth; ++Index) {
+    if (DemandedEltsOp[Index]) {
+      DemandedEltsLHS.setBit(2 * Index);
+    }
+  }
+  for (; Index < DemandedEltsOp.getBitWidth(); ++Index) {
+    if (DemandedEltsOp[Index]) {
+      DemandedEltsRHS.setBit(2 * (Index - HalfBitWidth));
+    }
+  }
+}
+
 MapVector<Instruction *, uint64_t>
 llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
                                const TargetTransformInfo *TTI) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ecc5b3b3bf840..fdbfcb489ac4e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36953,6 +36953,41 @@ static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
   Known = Known.zext(64);
 }
 
+static KnownBits computeKnownBitsForHorizontalOperation(
+    const SDValue Op, const APInt &DemandedElts, unsigned Depth,
+    unsigned OpIndexStart, const SelectionDAG &DAG,
+    const std::function<KnownBits(const KnownBits &, const KnownBits &)>
+        KnownBitsFunc) {
+  APInt DemandedEltsLHS, DemandedEltsRHS;
+  getHorizontalDemandedElts(DemandedElts, DemandedEltsLHS, DemandedEltsRHS);
+
+  std::array<KnownBits, 2> KnownLHS;
+  for (unsigned Index = 0; Index < KnownLHS.size(); ++Index) {
+    if (!DemandedEltsLHS.isZero()) {
+      KnownLHS[Index] = DAG.computeKnownBits(Op.getOperand(OpIndexStart),
+                                             DemandedEltsLHS, Depth + 1);
+    } else {
+      KnownLHS[Index] = KnownBits(Op.getScalarValueSizeInBits());
+      KnownLHS[Index].setAllZero();
+    }
+    DemandedEltsLHS <<= 1;
+  }
+  std::array<KnownBits, 2> KnownRHS;
+  for (unsigned Index = 0; Index < KnownRHS.size(); ++Index) {
+    if (!DemandedEltsRHS.isZero()) {
+      KnownRHS[Index] = DAG.computeKnownBits(Op.getOperand(OpIndexStart + 1),
+                                             DemandedEltsRHS, Depth + 1);
+    } else {
+      KnownRHS[Index] = KnownBits(Op.getScalarValueSizeInBits());
+      KnownRHS[Index].setAllZero();
+    }
+    DemandedEltsRHS <<= 1;
+  }
+
+  return KnownBitsFunc(KnownLHS[0], KnownLHS[1])
+      .intersectWith(KnownBitsFunc(KnownRHS[0], KnownRHS[1]));
+}
+
 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                       KnownBits &Known,
                                                       const APInt &DemandedElts,
@@ -37262,6 +37297,17 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     }
     break;
   }
+  case X86ISD::HADD:
+  case X86ISD::HSUB: {
+    Known = computeKnownBitsForHorizontalOperation(
+        Op, DemandedElts, Depth, /*OpIndexStart=*/0, DAG,
+        [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+          return KnownBits::computeForAddSub(
+              /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
+              KnownLHS, KnownRHS);
+        });
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     switch (Op->getConstantOperandVal(0)) {
     case Intrinsic::x86_sse2_psad_bw:
@@ -37276,6 +37322,55 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
       computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
       break;
     }
+    case Intrinsic::x86_ssse3_phadd_d:
+    case Intrinsic::x86_ssse3_phadd_w:
+    case Intrinsic::x86_ssse3_phadd_d_128:
+    case Intrinsic::x86_ssse3_phadd_w_128:
+    case Intrinsic::x86_avx2_phadd_d:
+    case Intrinsic::x86_avx2_phadd_w: {
+      Known = computeKnownBitsForHorizontalOperation(
+          Op, DemandedElts, Depth, /*OpIndexStart=*/1, DAG,
+          [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+            return KnownBits::computeForAddSub(
+                /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KnownLHS, KnownRHS);
+          });
+      break;
+    }
+    case Intrinsic::x86_ssse3_phadd_sw:
+    case Intrinsic::x86_ssse3_phadd_sw_128:
+    case Intrinsic::x86_avx2_phadd_sw: {
+      Known = computeKnownBitsForHorizontalOperation(
+          Op, DemandedElts, Depth, /*OpIndexStart=*/1, DAG,
+          [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+            return KnownBits::sadd_sat(KnownLHS, KnownRHS);
+          });
+      break;
+    }
+    case Intrinsic::x86_ssse3_phsub_d:
+    case Intrinsic::x86_ssse3_phsub_w:
+    case Intrinsic::x86_ssse3_phsub_d_128:
+    case Intrinsic::x86_ssse3_phsub_w_128:
+    case Intrinsic::x86_avx2_phsub_d:
+    case Intrinsic::x86_avx2_phsub_w: {
+      Known = computeKnownBitsForHorizontalOperation(
+          Op, DemandedElts, Depth, /*OpIndexStart=*/1, DAG,
+          [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+            return KnownBits::computeForAddSub(/*Add=*/false, /*NSW=*/false,
+                                               /*NUW=*/false, KnownLHS,
+                                               KnownRHS);
+          });
+      break;
+    }
+    case Intrinsic::x86_ssse3_phsub_sw:
+    case Intrinsic::x86_ssse3_phsub_sw_128:
+    case Intrinsic::x86_avx2_phsub_sw: {
+      Known = computeKnownBitsForHorizontalOperation(
+          Op, DemandedElts, Depth, /*OpIndexStart=*/1, DAG,
+          [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+            return KnownBits::ssub_sat(KnownLHS, KnownRHS);
+          });
+      break;
+    }
     }
     break;
   }
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-hadd-hsub.ll b/llvm/test/Analysis/ValueTracking/knownbits-hadd-hsub.ll
index 663e9ed01888d..e2fe873d715cd 100644
--- a/llvm/test/Analysis/ValueTracking/knownbits-hadd-hsub.ll
+++ b/llvm/test/Analysis/ValueTracking/knownbits-hadd-hsub.ll
@@ -5,12 +5,7 @@ define <4 x i1> @hadd_and_eq_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: define <4 x i1> @hadd_and_eq_v4i32(
 ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[TMP2]], <i32 -8, i32 -8, i32 -8, i32 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <4 x i32> [[TMP3]], <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    ret <4 x i1> [[RET]]
+; CHECK-NEXT:    ret <4 x i1> zeroinitializer
 ;
 entry:
   %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
@@ -25,12 +20,7 @@ define <8 x i1> @hadd_and_eq_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16(
 ; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -45,12 +35,7 @@ define <8 x i1> @hadd_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16_sat(
 ; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -65,12 +50,7 @@ define <8 x i1> @hadd_and_eq_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i32(
 ; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP2]], <i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i32> [[TMP3]], <i32 3, i32 4, i32 5, i32 6, i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> zeroinitializer
 ;
 entry:
   %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -85,12 +65,7 @@ define <16 x i1> @hadd_and_eq_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16(
 ; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
-; CHECK-NEXT:    ret <16 x i1> [[RET]]
+; CHECK-NEXT:    ret <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -105,12 +80,7 @@ define <16 x i1> @hadd_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16_sat(
 ; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
-; CHECK-NEXT:    ret <16 x i1> [[RET]]
+; CHECK-NEXT:    ret <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -125,12 +95,7 @@ define <4 x i1> @hsub_trunc_eq_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: define <4 x i1> @hsub_trunc_eq_v4i32(
 ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = or <4 x i32> [[X]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[Y]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
-; CHECK-NEXT:    [[CONV:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <4 x i16> [[CONV]], <i16 3, i16 4, i16 5, i16 6>
-; CHECK-NEXT:    ret <4 x i1> [[RET]]
+; CHECK-NEXT:    ret <4 x i1> zeroinitializer
 ;
 entry:
   %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -145,12 +110,7 @@ define <8 x i1> @hsub_trunc_eq_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i16(
 ; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i16> [[X]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-; CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i16> [[Y]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
-; CHECK-NEXT:    [[CONV:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i8> [[CONV]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -165,13 +125,7 @@ define <8 x i1> @hsub_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: define <8 x i1> @hsub_and_eq_v8i16_sat(
 ; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR1:%.*]] = or <8 x i16> [[X]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
-; CHECK-NEXT:    [[OR2:%.*]] = or <8 x i16> [[Y]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
-; CHECK-NEXT:    [[AND1:%.*]] = and <8 x i16> [[OR1]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
-; CHECK-NEXT:    [[AND2:%.*]] = and <8 x i16> [[OR2]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
-; CHECK-NEXT:    [[HSUB:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> [[AND1]], <8 x i16> [[AND2]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp slt <8 x i16> [[HSUB]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ;
 entry:
   %or1 = or <8 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
@@ -187,12 +141,7 @@ define <8 x i1> @hsub_trunc_eq_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i32(
 ; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i32> [[X]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i32> [[Y]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]])
-; CHECK-NEXT:    [[CONV:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[CONV]], <i16 3, i16 4, i16 5, i16 6, i16 3, i16 4, i16 5, i16 6>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> zeroinitializer
 ;
 entry:
   %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@@ -207,12 +156,7 @@ define <16 x i1> @hsub_trunc_eq_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: define <16 x i1> @hsub_trunc_eq_v16i16(
 ; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = or <16 x i16> [[X]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-; CHECK-NEXT:    [[TMP1:%.*]] = or <16 x i16> [[Y]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
-; CHECK-NEXT:    [[CONV:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i8> [[CONV]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
-; CHECK-NEXT:    ret <16 x i1> [[RET]]
+; CHECK-NEXT:    ret <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -227,13 +171,7 @@ define <16 x i1> @hsub_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: define <16 x i1> @hsub_and_eq_v16i16_sat(
 ; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR1:%.*]] = or <16 x i16> [[X]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
-; CHECK-NEXT:    [[OR2:%.*]] = or <16 x i16> [[Y]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
-; CHECK-NEXT:    [[AND1:%.*]] = and <16 x i16> [[OR1]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
-; CHECK-NEXT:    [[AND2:%.*]] = and <16 x i16> [[OR2]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
-; CHECK-NEXT:    [[HSUB:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> [[AND1]], <16 x i16> [[AND2]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp slt <16 x i16> [[HSUB]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    ret <16 x i1> [[RET]]
+; CHECK-NEXT:    ret <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ;
 entry:
   %or1 = or <16 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
@@ -249,11 +187,7 @@ define <4 x i1> @hadd_shuffle_2st_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: define <4 x i1> @hadd_shuffle_2st_v4i32(
 ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 -1, i32 -1, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[Y]])
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    ret <4 x i1> [[RET]]
+; CHECK-NEXT:    ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
 ;
 entry:
   %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3>
@@ -268,11 +202,7 @@ define <4 x i1> @hadd_shuffle_4th_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: define <4 x i1> @hadd_shuffle_4th_v4i32(
 ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[Y]], <i32 -1, i32 -1, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[X]], <4 x i32> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    ret <4 x i1> [[RET]]
+; CHECK-NEXT:    ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
 ;
 entry:
   %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
diff --git a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
index ac6895fa5a8b9..0f7c3d9b0c172 100644
--- a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
+++ b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
@@ -8,7 +8,7 @@ define <4 x i16> @hadd_trunc_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
@@ -25,7 +25,7 @@ define <8 x i8> @hadd_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -42,7 +42,7 @@ define <8 x i8> @hadd_trunc_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -59,9 +59,8 @@ define <8 x i16> @hadd_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
@@ -79,7 +78,6 @@ define <16 x i8> @hadd_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
@@ -99,7 +97,6 @@ define <16 x i8> @hadd_trunc_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
@@ -115,11 +112,7 @@ entry:
 define <4 x i16> @hsub_trunc_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: hsub_trunc_v4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -132,11 +125,7 @@ entry:
 define <8 x i8> @hsub_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: hsub_trunc_v8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -156,7 +145,7 @@ define <8 x i8> @hsub_trunc_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %or1 = or <8 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
@@ -171,14 +160,7 @@ entry:
 define <8 x i16> @hsub_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: hsub_trunc_v8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; CHECK-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@@ -191,14 +173,7 @@ entry:
 define <16 x i8> @hsub_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: hsub_trunc_v16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -218,7 +193,6 @@ define <16 x i8> @hsub_trunc_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
@@ -238,7 +212,9 @@ define <4 x i16> @hadd_extract_2st_trunc_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; CHECK-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3>
@@ -254,7 +230,9 @@ define <4 x i16> @hadd_extract_4th_trunc_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
 ; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[12,13],zero,zero,xmm0[12,13,12,13,14,15]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
diff --git a/llvm/test/CodeGen/X86/pr53247.ll b/llvm/test/CodeGen/X86/pr53247.ll
index 2fc2ffb414e0e..cb5e699c8da5e 100644
--- a/llvm/test/CodeGen/X86/pr53247.ll
+++ b/llvm/test/CodeGen/X86/pr53247.ll
@@ -5,18 +5,12 @@
 define i32 @PR53247(){
 ; SSE-LABEL: PR53247:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pxor %xmm0, %xmm0
-; SSE-NEXT:    phaddd %xmm0, %xmm0
-; SSE-NEXT:    phaddd %xmm0, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    xorl %eax, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: PR53247:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    xorl %eax, %eax
 ; AVX-NEXT:    retq
 entry:
   %0 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer)
diff --git a/llvm/unittests/Analysis/VectorUtilsTest.cpp b/llvm/unittests/Analysis/VectorUtilsTest.cpp
index 14958aa646a04..770a863be94fe 100644
--- a/llvm/unittests/Analysis/VectorUtilsTest.cpp
+++ b/llvm/unittests/Analysis/VectorUtilsTest.cpp
@@ -242,6 +242,26 @@ TEST_F(BasicTest, getShuffleDemandedElts) {
   EXPECT_EQ(RHS.getZExtValue(), 0x9U);
 }
 
+TEST_F(BasicTest, getHorizontalDemandedElts) {
+  APInt LHS, RHS;
+
+  getHorizontalDemandedElts(APInt(4, 0b0000), LHS, RHS);
+  EXPECT_EQ(LHS.getZExtValue(), 0b0000U);
+  EXPECT_EQ(RHS.getZExtValue(), 0b0000U);
+
+  getHorizontalDemandedElts(APInt(4, 0b0001), LHS, RHS);
+  EXPECT_EQ(LHS.getZExtValue(), 0b0001U);
+  EXPECT_EQ(RHS.getZExtValue(), 0b0000U);
+
+  getHorizontalDemandedElts(APInt(4, 0b1000), LHS, RHS);
+  EXPECT_EQ(LHS.getZExtValue(), 0b0000U);
+  EXPECT_EQ(RHS.getZExtValue(), 0b0100U);
+
+  getHorizontalDemandedElts(APInt(4, 0b0110), LHS, RHS);
+  EXPECT_EQ(LHS.getZExtValue(), 0b0100U);
+  EXPECT_EQ(RHS.getZExtValue(), 0b0001U);
+}
+
 TEST_F(BasicTest, getSplatIndex) {
   EXPECT_EQ(getSplatIndex({0,0,0}), 0);
   EXPECT_EQ(getSplatIndex({1,0,0}), -1);     // no splat
