[llvm] [ValueTracking][X86] Compute KnownBits for phadd/phsub (PR #92429)

via llvm-commits llvm-commits at lists.llvm.org
Sat Jun 15 06:19:08 PDT 2024


https://github.com/mskamp updated https://github.com/llvm/llvm-project/pull/92429

From 95724e9c88e8ea4a9398ba519d8709fac9500493 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Wed, 15 May 2024 18:05:06 +0200
Subject: [PATCH 1/2] [ValueTracking][X86][NFC] Add Tests for KnownBits of
 phadd/phsub

---
 .../ValueTracking/knownbits-x86-hadd-hsub.ll  | 324 ++++++++++++++++++
 llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll  | 235 +++++++++++++
 2 files changed, 559 insertions(+)
 create mode 100644 llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll
 create mode 100644 llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll

diff --git a/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll b/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll
new file mode 100644
index 0000000000000..663e9ed01888d
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll
@@ -0,0 +1,324 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define <4 x i1> @hadd_and_eq_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_and_eq_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[TMP2]], <i32 -8, i32 -8, i32 -8, i32 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <4 x i32> [[TMP3]], <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %andr = and <4 x i32> %hadd, <i32 -8, i32 -8, i32 -8, i32 -8>
+  %ret = icmp eq <4 x i32> %andr, <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i1> %ret
+}
+
+define <8 x i1> @hadd_and_eq_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16(
+; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %and1, <8 x i16> %and2)
+  %andr = and <8 x i16> %hadd, <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+  %ret = icmp eq <8 x i16> %andr, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+  ret <8 x i1> %ret
+}
+
+define <8 x i1> @hadd_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16_sat(
+; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %and1, <8 x i16> %and2)
+  %andr = and <8 x i16> %hadd, <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+  %ret = icmp eq <8 x i16> %andr, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+  ret <8 x i1> %ret
+}
+
+define <8 x i1> @hadd_and_eq_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i32(
+; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP2]], <i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i32> [[TMP3]], <i32 3, i32 4, i32 5, i32 6, i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
+  %andr = and <8 x i32> %hadd, <i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8>
+  %ret = icmp eq <8 x i32> %andr, <i32 3, i32 4, i32 5, i32 6, i32 3, i32 4, i32 5, i32 6>
+  ret <8 x i1> %ret
+}
+
+define <16 x i1> @hadd_and_eq_v16i16(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16(
+; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+; CHECK-NEXT:    ret <16 x i1> [[RET]]
+;
+entry:
+  %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %and1, <16 x i16> %and2)
+  %andr = and <16 x i16> %hadd, <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+  %ret = icmp eq <16 x i16> %andr, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+  ret <16 x i1> %ret
+}
+
+define <16 x i1> @hadd_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16_sat(
+; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+; CHECK-NEXT:    ret <16 x i1> [[RET]]
+;
+entry:
+  %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %and1, <16 x i16> %and2)
+  %andr = and <16 x i16> %hadd, <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
+  %ret = icmp eq <16 x i16> %andr, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
+  ret <16 x i1> %ret
+}
+
+define <4 x i1> @hsub_trunc_eq_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hsub_trunc_eq_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = or <4 x i32> [[X]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[Y]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[CONV:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <4 x i16> [[CONV]], <i16 3, i16 4, i16 5, i16 6>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %or2 = or <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %hsub = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %or1, <4 x i32> %or2)
+  %conv = trunc <4 x i32> %hsub to <4 x i16>
+  %ret = icmp eq <4 x i16> %conv, <i16 3, i16 4, i16 5, i16 6>
+  ret <4 x i1> %ret
+}
+
+define <8 x i1> @hsub_trunc_eq_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i16(
+; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i16> [[X]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i16> [[Y]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[CONV:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i8> [[CONV]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %or2 = or <8 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %or1, <8 x i16> %or2)
+  %conv = trunc <8 x i16> %hsub to <8 x i8>
+  %ret = icmp eq <8 x i8> %conv, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
+  ret <8 x i1> %ret
+}
+
+define <8 x i1> @hsub_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <8 x i1> @hsub_and_eq_v8i16_sat(
+; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR1:%.*]] = or <8 x i16> [[X]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+; CHECK-NEXT:    [[OR2:%.*]] = or <8 x i16> [[Y]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+; CHECK-NEXT:    [[AND1:%.*]] = and <8 x i16> [[OR1]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+; CHECK-NEXT:    [[AND2:%.*]] = and <8 x i16> [[OR2]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+; CHECK-NEXT:    [[HSUB:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> [[AND1]], <8 x i16> [[AND2]])
+; CHECK-NEXT:    [[RET:%.*]] = icmp slt <8 x i16> [[HSUB]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %or1 = or <8 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %or2 = or <8 x i16> %y, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %and1 = and <8 x i16> %or1, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %and2 = and <8 x i16> %or2, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %and1, <8 x i16> %and2)
+  %ret = icmp sle <8 x i16> %hsub, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i1> %ret
+}
+
+define <8 x i1> @hsub_trunc_eq_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i32(
+; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i32> [[X]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i32> [[Y]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[CONV:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[CONV]], <i16 3, i16 4, i16 5, i16 6, i16 3, i16 4, i16 5, i16 6>
+; CHECK-NEXT:    ret <8 x i1> [[RET]]
+;
+entry:
+  %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %or2 = or <8 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %hsub = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %or1, <8 x i32> %or2)
+  %conv = trunc <8 x i32> %hsub to <8 x i16>
+  %ret = icmp eq <8 x i16> %conv, <i16 3, i16 4, i16 5, i16 6, i16 3, i16 4, i16 5, i16 6>
+  ret <8 x i1> %ret
+}
+
+define <16 x i1> @hsub_trunc_eq_v16i16(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: define <16 x i1> @hsub_trunc_eq_v16i16(
+; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = or <16 x i16> [[X]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP1:%.*]] = or <16 x i16> [[Y]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
+; CHECK-NEXT:    [[CONV:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i8> [[CONV]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
+; CHECK-NEXT:    ret <16 x i1> [[RET]]
+;
+entry:
+  %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %or2 = or <16 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %or1, <16 x i16> %or2)
+  %conv = trunc <16 x i16> %hsub to <16 x i8>
+  %ret = icmp eq <16 x i8> %conv, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
+  ret <16 x i1> %ret
+}
+
+define <16 x i1> @hsub_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: define <16 x i1> @hsub_and_eq_v16i16_sat(
+; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR1:%.*]] = or <16 x i16> [[X]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+; CHECK-NEXT:    [[OR2:%.*]] = or <16 x i16> [[Y]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+; CHECK-NEXT:    [[AND1:%.*]] = and <16 x i16> [[OR1]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+; CHECK-NEXT:    [[AND2:%.*]] = and <16 x i16> [[OR2]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+; CHECK-NEXT:    [[HSUB:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> [[AND1]], <16 x i16> [[AND2]])
+; CHECK-NEXT:    [[RET:%.*]] = icmp slt <16 x i16> [[HSUB]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+; CHECK-NEXT:    ret <16 x i1> [[RET]]
+;
+entry:
+  %or1 = or <16 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %or2 = or <16 x i16> %y, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
+  %and1 = and <16 x i16> %or1, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %and2 = and <16 x i16> %or2, <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
+  %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %and1, <16 x i16> %and2)
+  %ret = icmp sle <16 x i16> %hsub, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <16 x i1> %ret
+}
+
+define <4 x i1> @hadd_shuffle_2st_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_shuffle_2st_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 -1, i32 -1, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[Y]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %shuf = shufflevector <4 x i32> %hadd, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
+  %ret = icmp ne <4 x i32> %shuf, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @hadd_shuffle_4th_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_shuffle_4th_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[Y]], <i32 -1, i32 -1, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[X]], <4 x i32> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %y, <i32 -1, i32 -1, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %shuf = shufflevector <4 x i32> %hadd, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  %ret = icmp ne <4 x i32> %shuf, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @hadd_shuffle_2st_negative_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_shuffle_2st_negative_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 3, i32 3, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP3]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %shuf = shufflevector <4 x i32> %hadd, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
+  %ret = icmp ne <4 x i32> %shuf, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @hadd_shuffle_4th_negative_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i1> @hadd_shuffle_4th_negative_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[Y]], <i32 3, i32 3, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP3]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <4 x i1> [[RET]]
+;
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 -1, i32 -1>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %shuf = shufflevector <4 x i32> %hadd, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  %ret = icmp ne <4 x i32> %shuf, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i1> %ret
+}
diff --git a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
new file mode 100644
index 0000000000000..1056919488ba1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
@@ -0,0 +1,235 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s
+
+define <4 x i32> @hadd_select_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: hadd_select_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
+; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
+; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
+; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
+  %cond = icmp ule <4 x i32> %hadd, <i32 8, i32 8, i32 8, i32 8>
+  %ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hadd
+  ret <4 x i32> %ret
+}
+
+define <8 x i8> @hadd_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: hadd_trunc_v8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3]
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %and1, <8 x i16> %and2)
+  %conv = trunc <8 x i16> %hadd to <8 x i8>
+  ret <8 x i8> %conv
+}
+
+define <8 x i16> @hadd_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: hadd_trunc_v8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
+  %conv = trunc <8 x i32> %hadd to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <16 x i8> @hadd_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: hadd_trunc_v16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %and1, <16 x i16> %and2)
+  %conv = trunc <16 x i16> %hadd to <16 x i8>
+  ret <16 x i8> %conv
+}
+
+define <4 x i32> @hsub_select_shl_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: hsub_select_shl_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpslld $16, %xmm0, %xmm1
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9]
+; CHECK-NEXT:    vpmaxud %xmm2, %xmm1, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %or2 = or <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %hsub = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %or1, <4 x i32> %or2)
+  %shl = shl <4 x i32> %hsub, <i32 16, i32 16, i32 16, i32 16>
+  %cond = icmp ule <4 x i32> %shl, <i32 8, i32 8, i32 8, i32 8>
+  %ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hsub
+  ret <4 x i32> %ret
+}
+
+define <8 x i8> @hsub_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: hsub_trunc_v8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    retq
+entry:
+  %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %or2 = or <8 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %or1, <8 x i16> %or2)
+  %conv = trunc <8 x i16> %hsub to <8 x i8>
+  ret <8 x i8> %conv
+}
+
+define <8 x i16> @hsub_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: hsub_trunc_v8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; CHECK-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %or2 = or <8 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %hsub = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %or1, <8 x i32> %or2)
+  %conv = trunc <8 x i32> %hsub to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <16 x i8> @hsub_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: hsub_trunc_v16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %or2 = or <16 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %or1, <16 x i16> %or2)
+  %conv = trunc <16 x i16> %hsub to <16 x i8>
+  ret <16 x i8> %conv
+}
+
+define <8 x i16> @hadd_extract_2st_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: hadd_extract_2st_trunc_v8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <8 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3, i32 -1, i32 -1, i32 -1, i32 -1>
+  %and2 = and <8 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
+  %andr = and <8 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %conv = trunc <8 x i32> %andr to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <8 x i16> @hadd_extract_8th_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: hadd_extract_8th_trunc_v8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <8 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %and2 = and <8 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 3, i32 3>
+  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
+  %andr = and <8 x i32> %hadd, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1>
+  %conv = trunc <8 x i32> %andr to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <8 x i16> @hadd_extract_2st_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: hadd_extract_2st_trunc_redundant_and_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 -1, i32 -1, i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
+  %andr = and <8 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %conv = trunc <8 x i32> %andr to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <8 x i16> @hadd_extract_4th_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: hadd_extract_4th_trunc_redundant_and_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 -1, i32 -1>
+  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
+  %andr = and <8 x i32> %hadd, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1>
+  %conv = trunc <8 x i32> %andr to <8 x i16>
+  ret <8 x i16> %conv
+}

From e8ce12522f67c32a26ae51a17e233ef1973f531b Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Thu, 16 May 2024 17:04:55 +0200
Subject: [PATCH 2/2] [ValueTracking][X86] Compute KnownBits for phadd/phsub

Add KnownBits computations to ValueTracking and X86 DAG lowering.

These instructions add/subtract adjacent vector elements in their
operands. Example: phadd [X1, X2] [Y1, Y2] = [X1 + X2, Y1 + Y2].
This means that, in this example, we can compute the KnownBits of the
operation by computing the KnownBits of X1 + X2 and of Y1 + Y2 and
intersecting the results, since every demanded result element comes
from one of these sums. This approach also generalizes to all x86
vector types.

There are also the operations phadd.sw and phsub.sw, which perform
saturating addition/subtraction. Use sadd_sat and ssub_sat to compute
the KnownBits of these operations in ValueTracking.

Also adjust the existing test case pr53247.ll because its result can
now be folded to a constant by the new KnownBits computation.

Fixes #82516.
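
As a minimal standalone sketch of this reasoning (illustrative only; it
uses the same llvm::KnownBits API that the patch calls): if both
elements of a horizontal pair are known to lie in [0, 3], their sum is
at most 6, so all bits from bit 3 upwards become known zero:

  #include "llvm/Support/KnownBits.h"
  #include <cassert>

  using namespace llvm;

  int main() {
    // Element with bits [2, 31] known zero, i.e. a value in [0, 3].
    KnownBits Elt(32);
    Elt.Zero.setBitsFrom(2);

    // KnownBits of one horizontal sum (no NSW/NUW), as in the patch.
    KnownBits Sum = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
                                                /*NUW=*/false, Elt, Elt);

    // Max sum is 3 + 3 = 6 < 8, so bits [3, 31] are known zero; KnownBits
    // can only express this as the range [0, 7].
    assert(Sum.getMaxValue().ule(7));

    // The saturating variants use KnownBits::sadd_sat/ssub_sat instead;
    // no saturation is possible here, so the same bound holds.
    KnownBits SatSum = KnownBits::sadd_sat(Elt, Elt);
    assert(SatSum.getMaxValue().ule(7));
    return 0;
  }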
---
 llvm/include/llvm/Analysis/VectorUtils.h      | 17 ++++
 llvm/lib/Analysis/ValueTracking.cpp           | 65 ++++++++++++
 llvm/lib/Analysis/VectorUtils.cpp             | 28 ++++++
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 66 ++++++++-----
 .../ValueTracking/knownbits-x86-hadd-hsub.ll  | 98 +++----------------
 llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll  | 51 ++--------
 llvm/test/CodeGen/X86/pr53247.ll              | 10 +-
 llvm/unittests/Analysis/VectorUtilsTest.cpp   | 24 +++++
 8 files changed, 201 insertions(+), 158 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 521dac08792f5..88965df7462db 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -246,6 +246,23 @@ void processShuffleMasks(
     function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
     function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);
 
+/// Compute the demanded element masks of a horizontal binary operation. A
+/// horizontal operation combines two adjacent elements in each vector
+/// operand. This function returns masks for the elements that act as the
+/// first operand of such a horizontal combination. For example, for two
+/// vectors [X1, X2, X3, X4] and [Y1, Y2, Y3, Y4], the resulting masks can
+/// include the elements X1, X3, Y1, and Y3. To get the masks for the second
+/// operands, simply shift the results of this function to the left by 1.
+///
+/// \param VectorBitWidth the total bit width of the vector
+/// \param DemandedElts   the demanded elements mask for the operation
+/// \param DemandedLHS    the demanded elements mask for the left operand
+/// \param DemandedRHS    the demanded elements mask for the right operand
+void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth,
+                                         const APInt &DemandedElts,
+                                         APInt &DemandedLHS,
+                                         APInt &DemandedRHS);
+
 /// Compute a map of integer instructions to their minimum legal type
 /// size.
 ///
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 2fdbb6e3ef840..2a6a593985aa0 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -950,6 +950,33 @@ getKnownBitsFromAndXorOr(const Operator *I, const APInt &DemandedElts,
   return KnownOut;
 }
 
+static KnownBits computeKnownBitsForHorizontalOperation(
+    const Operator *I, const APInt &DemandedElts, unsigned Depth,
+    const SimplifyQuery &Q,
+    const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
+        KnownBitsFunc) {
+  APInt DemandedEltsLHS, DemandedEltsRHS;
+  getHorizDemandedEltsForFirstOperand(Q.DL.getTypeSizeInBits(I->getType()),
+                                      DemandedElts, DemandedEltsLHS,
+                                      DemandedEltsRHS);
+
+  const auto ComputeForSingleOpFunc =
+      [Depth, &Q, KnownBitsFunc](const Value *Op, APInt &DemandedEltsOp) {
+        return KnownBitsFunc(
+            computeKnownBits(Op, DemandedEltsOp, Depth + 1, Q),
+            computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1, Q));
+      };
+
+  if (!DemandedEltsLHS.isZero() && !DemandedEltsRHS.isZero()) {
+    return ComputeForSingleOpFunc(I->getOperand(0), DemandedEltsLHS)
+        .intersectWith(ComputeForSingleOpFunc(I->getOperand(1), DemandedEltsRHS));
+  }
+  if (!DemandedEltsLHS.isZero()) {
+    return ComputeForSingleOpFunc(I->getOperand(0), DemandedEltsLHS);
+  }
+  return ComputeForSingleOpFunc(I->getOperand(1), DemandedEltsRHS);
+}
+
 // Public so this can be used in `SimplifyDemandedUseBits`.
 KnownBits llvm::analyzeKnownBitsFromAndXorOr(const Operator *I,
                                              const KnownBits &KnownLHS,
@@ -1725,6 +1752,44 @@ static void computeKnownBitsFromOperator(const Operator *I,
       case Intrinsic::x86_sse42_crc32_64_64:
         Known.Zero.setBitsFrom(32);
         break;
+      case Intrinsic::x86_ssse3_phadd_d_128:
+      case Intrinsic::x86_ssse3_phadd_w_128:
+      case Intrinsic::x86_avx2_phadd_d:
+      case Intrinsic::x86_avx2_phadd_w: {
+        Known = computeKnownBitsForHorizontalOperation(
+            I, DemandedElts, Depth, Q,
+            [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+              return KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
+                                                 /*NUW=*/false, KnownLHS,
+                                                 KnownRHS);
+            });
+        break;
+      }
+      case Intrinsic::x86_ssse3_phadd_sw_128:
+      case Intrinsic::x86_avx2_phadd_sw: {
+        Known = computeKnownBitsForHorizontalOperation(I, DemandedElts, Depth,
+                                                       Q, KnownBits::sadd_sat);
+        break;
+      }
+      case Intrinsic::x86_ssse3_phsub_d_128:
+      case Intrinsic::x86_ssse3_phsub_w_128:
+      case Intrinsic::x86_avx2_phsub_d:
+      case Intrinsic::x86_avx2_phsub_w: {
+        Known = computeKnownBitsForHorizontalOperation(
+            I, DemandedElts, Depth, Q,
+            [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+              return KnownBits::computeForAddSub(/*Add=*/false, /*NSW=*/false,
+                                                 /*NUW=*/false, KnownLHS,
+                                                 KnownRHS);
+            });
+        break;
+      }
+      case Intrinsic::x86_ssse3_phsub_sw_128:
+      case Intrinsic::x86_avx2_phsub_sw: {
+        Known = computeKnownBitsForHorizontalOperation(I, DemandedElts, Depth,
+                                                       Q, KnownBits::ssub_sat);
+        break;
+      }
       case Intrinsic::riscv_vsetvli:
       case Intrinsic::riscv_vsetvlimax: {
         bool HasAVL = II->getIntrinsicID() == Intrinsic::riscv_vsetvli;
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 917094267d05a..e00cb8dd6b897 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -541,6 +541,34 @@ void llvm::processShuffleMasks(
   }
 }
 
+void llvm::getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth,
+                                               const APInt &DemandedElts,
+                                               APInt &DemandedLHS,
+                                               APInt &DemandedRHS) {
+  assert(VectorBitWidth >= 128 && "Vectors smaller than 128 bit not supported");
+  int NumLanes = VectorBitWidth / 128;
+  int NumElts = DemandedElts.getBitWidth();
+  int NumEltsPerLane = NumElts / NumLanes;
+  int HalfEltsPerLane = NumEltsPerLane / 2;
+
+  DemandedLHS = APInt::getZero(NumElts);
+  DemandedRHS = APInt::getZero(NumElts);
+
+  // Map DemandedElts to the horizontal operands.
+  for (int Idx = 0; Idx != NumElts; ++Idx) {
+    if (!DemandedElts[Idx])
+      continue;
+    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
+    int LocalIdx = Idx % NumEltsPerLane;
+    if (LocalIdx < HalfEltsPerLane) {
+      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx);
+    } else {
+      LocalIdx -= HalfEltsPerLane;
+      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx);
+    }
+  }
+}
+
 MapVector<Instruction *, uint64_t>
 llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
                                const TargetTransformInfo *TTI) {
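
For reference, a minimal standalone sketch of the new helper's contract
(illustrative only; it assumes the declaration added to VectorUtils.h
above): with a 256-bit vector and all 8 elements demanded, both result
masks select the first element of each adjacent pair in each 128-bit
lane:

  #include "llvm/ADT/APInt.h"
  #include "llvm/Analysis/VectorUtils.h"
  #include <cassert>

  using namespace llvm;

  int main() {
    // 256-bit vector with 8 demanded elements: two 128-bit lanes of
    // four elements each.
    APInt DemandedElts = APInt::getAllOnes(8);
    APInt DemandedLHS, DemandedRHS;
    getHorizDemandedEltsForFirstOperand(/*VectorBitWidth=*/256, DemandedElts,
                                        DemandedLHS, DemandedRHS);

    // First element of each pair: bits 0 and 2 in lane 0, bits 4 and 6
    // in lane 1, in the element space of each operand.
    assert(DemandedLHS == APInt(8, 0x55));
    assert(DemandedRHS == APInt(8, 0x55));

    // Shifting left by 1 selects the second element of each pair.
    assert((DemandedLHS << 1) == APInt(8, 0xaa));
    return 0;
  }

The X86 variant getHorizDemandedElts keeps its old behavior on top of
this helper by OR-ing each mask with its left-shift by 1, as shown in
the X86ISelLowering.cpp hunk below.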
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ecc5b3b3bf840..34583fa332a89 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5180,29 +5180,10 @@ static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
 // Split the demanded elts of a HADD/HSUB node between its operands.
 static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
                                  APInt &DemandedLHS, APInt &DemandedRHS) {
-  int NumLanes = VT.getSizeInBits() / 128;
-  int NumElts = DemandedElts.getBitWidth();
-  int NumEltsPerLane = NumElts / NumLanes;
-  int HalfEltsPerLane = NumEltsPerLane / 2;
-
-  DemandedLHS = APInt::getZero(NumElts);
-  DemandedRHS = APInt::getZero(NumElts);
-
-  // Map DemandedElts to the horizontal operands.
-  for (int Idx = 0; Idx != NumElts; ++Idx) {
-    if (!DemandedElts[Idx])
-      continue;
-    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
-    int LocalIdx = Idx % NumEltsPerLane;
-    if (LocalIdx < HalfEltsPerLane) {
-      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
-      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
-    } else {
-      LocalIdx -= HalfEltsPerLane;
-      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
-      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
-    }
-  }
+  getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
+                                      DemandedLHS, DemandedRHS);
+  DemandedLHS |= DemandedLHS << 1;
+  DemandedRHS |= DemandedRHS << 1;
 }
 
 /// Calculates the shuffle mask corresponding to the target-specific opcode.
@@ -36953,6 +36934,34 @@ static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
   Known = Known.zext(64);
 }
 
+static KnownBits computeKnownBitsForHorizontalOperation(
+    const SDValue Op, const APInt &DemandedElts, unsigned Depth,
+    unsigned OpIndexStart, const SelectionDAG &DAG,
+    const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
+        KnownBitsFunc) {
+  APInt DemandedEltsLHS, DemandedEltsRHS;
+  getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
+                                      DemandedElts, DemandedEltsLHS,
+                                      DemandedEltsRHS);
+
+  const auto ComputeForSingleOpFunc =
+      [&DAG, Depth, KnownBitsFunc](const SDValue &Op, APInt &DemandedEltsOp) {
+        return KnownBitsFunc(
+            DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
+            DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
+      };
+
+  if (!DemandedEltsLHS.isZero() && !DemandedEltsRHS.isZero()) {
+    return ComputeForSingleOpFunc(Op.getOperand(OpIndexStart), DemandedEltsLHS)
+        .intersectWith(ComputeForSingleOpFunc(Op.getOperand(OpIndexStart + 1),
+                                              DemandedEltsRHS));
+  }
+  if (!DemandedEltsLHS.isZero()) {
+    return ComputeForSingleOpFunc(Op.getOperand(OpIndexStart), DemandedEltsLHS);
+  }
+  return ComputeForSingleOpFunc(Op.getOperand(OpIndexStart + 1), DemandedEltsRHS);
+}
+
 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                       KnownBits &Known,
                                                       const APInt &DemandedElts,
@@ -37262,6 +37271,17 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     }
     break;
   }
+  case X86ISD::HADD:
+  case X86ISD::HSUB: {
+    Known = computeKnownBitsForHorizontalOperation(
+        Op, DemandedElts, Depth, /*OpIndexStart=*/0, DAG,
+        [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
+          return KnownBits::computeForAddSub(
+              /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
+              KnownLHS, KnownRHS);
+        });
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     switch (Op->getConstantOperandVal(0)) {
     case Intrinsic::x86_sse2_psad_bw:
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll b/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll
index 663e9ed01888d..e2fe873d715cd 100644
--- a/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll
+++ b/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll
@@ -5,12 +5,7 @@ define <4 x i1> @hadd_and_eq_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: define <4 x i1> @hadd_and_eq_v4i32(
 ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[TMP2]], <i32 -8, i32 -8, i32 -8, i32 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <4 x i32> [[TMP3]], <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    ret <4 x i1> [[RET]]
+; CHECK-NEXT:    ret <4 x i1> zeroinitializer
 ;
 entry:
   %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
@@ -25,12 +20,7 @@ define <8 x i1> @hadd_and_eq_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16(
 ; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -45,12 +35,7 @@ define <8 x i1> @hadd_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16_sat(
 ; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -65,12 +50,7 @@ define <8 x i1> @hadd_and_eq_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i32(
 ; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i32> [[X]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i32> [[Y]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP2]], <i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i32> [[TMP3]], <i32 3, i32 4, i32 5, i32 6, i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> zeroinitializer
 ;
 entry:
   %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -85,12 +65,7 @@ define <16 x i1> @hadd_and_eq_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16(
 ; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
-; CHECK-NEXT:    ret <16 x i1> [[RET]]
+; CHECK-NEXT:    ret <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -105,12 +80,7 @@ define <16 x i1> @hadd_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16_sat(
 ; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i16> [[X]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i16> [[Y]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i16> [[TMP2]], <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i16> [[TMP3]], <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 0>
-; CHECK-NEXT:    ret <16 x i1> [[RET]]
+; CHECK-NEXT:    ret <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -125,12 +95,7 @@ define <4 x i1> @hsub_trunc_eq_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: define <4 x i1> @hsub_trunc_eq_v4i32(
 ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = or <4 x i32> [[X]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[Y]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
-; CHECK-NEXT:    [[CONV:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <4 x i16> [[CONV]], <i16 3, i16 4, i16 5, i16 6>
-; CHECK-NEXT:    ret <4 x i1> [[RET]]
+; CHECK-NEXT:    ret <4 x i1> zeroinitializer
 ;
 entry:
   %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -145,12 +110,7 @@ define <8 x i1> @hsub_trunc_eq_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i16(
 ; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i16> [[X]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-; CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i16> [[Y]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
-; CHECK-NEXT:    [[CONV:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i8> [[CONV]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -165,13 +125,7 @@ define <8 x i1> @hsub_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: define <8 x i1> @hsub_and_eq_v8i16_sat(
 ; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR1:%.*]] = or <8 x i16> [[X]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
-; CHECK-NEXT:    [[OR2:%.*]] = or <8 x i16> [[Y]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
-; CHECK-NEXT:    [[AND1:%.*]] = and <8 x i16> [[OR1]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
-; CHECK-NEXT:    [[AND2:%.*]] = and <8 x i16> [[OR2]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
-; CHECK-NEXT:    [[HSUB:%.*]] = tail call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> [[AND1]], <8 x i16> [[AND2]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp slt <8 x i16> [[HSUB]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ;
 entry:
   %or1 = or <8 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
@@ -187,12 +141,7 @@ define <8 x i1> @hsub_trunc_eq_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i32(
 ; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i32> [[X]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i32> [[Y]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]])
-; CHECK-NEXT:    [[CONV:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <8 x i16> [[CONV]], <i16 3, i16 4, i16 5, i16 6, i16 3, i16 4, i16 5, i16 6>
-; CHECK-NEXT:    ret <8 x i1> [[RET]]
+; CHECK-NEXT:    ret <8 x i1> zeroinitializer
 ;
 entry:
   %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@@ -207,12 +156,7 @@ define <16 x i1> @hsub_trunc_eq_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: define <16 x i1> @hsub_trunc_eq_v16i16(
 ; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = or <16 x i16> [[X]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-; CHECK-NEXT:    [[TMP1:%.*]] = or <16 x i16> [[Y]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> [[TMP0]], <16 x i16> [[TMP1]])
-; CHECK-NEXT:    [[CONV:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq <16 x i8> [[CONV]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 0>
-; CHECK-NEXT:    ret <16 x i1> [[RET]]
+; CHECK-NEXT:    ret <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>
 ;
 entry:
   %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -227,13 +171,7 @@ define <16 x i1> @hsub_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: define <16 x i1> @hsub_and_eq_v16i16_sat(
 ; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR1:%.*]] = or <16 x i16> [[X]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
-; CHECK-NEXT:    [[OR2:%.*]] = or <16 x i16> [[Y]], <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
-; CHECK-NEXT:    [[AND1:%.*]] = and <16 x i16> [[OR1]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
-; CHECK-NEXT:    [[AND2:%.*]] = and <16 x i16> [[OR2]], <i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3, i16 7, i16 3>
-; CHECK-NEXT:    [[HSUB:%.*]] = tail call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> [[AND1]], <16 x i16> [[AND2]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp slt <16 x i16> [[HSUB]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    ret <16 x i1> [[RET]]
+; CHECK-NEXT:    ret <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ;
 entry:
   %or1 = or <16 x i16> %x, <i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0, i16 3, i16 0>
@@ -249,11 +187,7 @@ define <4 x i1> @hadd_shuffle_2st_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: define <4 x i1> @hadd_shuffle_2st_v4i32(
 ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[X]], <i32 -1, i32 -1, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[Y]])
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 1, i32 5, i32 6>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    ret <4 x i1> [[RET]]
+; CHECK-NEXT:    ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
 ;
 entry:
   %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3>
@@ -268,11 +202,7 @@ define <4 x i1> @hadd_shuffle_4th_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: define <4 x i1> @hadd_shuffle_4th_v4i32(
 ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[Y]], <i32 -1, i32 -1, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[X]], <4 x i32> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    ret <4 x i1> [[RET]]
+; CHECK-NEXT:    ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
 ;
 entry:
   %and1 = and <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
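
Every InstCombine check above collapses to a constant because computeKnownBits can now see through the phadd/phsub intrinsics. Two lane-level invariants carry all of these folds: masking each input lane with 3 bounds every pairwise sum by 6, so the and with -8 is always zero; and forcing the low 16 (or 8) bits of both inputs to all-ones zeroes the low 16 (or 8) bits of every pairwise difference, so the truncated result is always zero. The saturating and shuffle variants fold for the same reason, since saturation and the constant shuffle lanes only tighten the bounds. A plain-C++ sanity check of the two invariants (ordinary wrap-around unsigned arithmetic, not any LLVM API):

#include <cassert>
#include <cstdint>

int main() {
  // hadd tests: lanes masked to [0, 3], so every pairwise sum lies in
  // [0, 6] and "sum & -8" is always zero.
  for (uint16_t a = 0; a <= 3; ++a)
    for (uint16_t b = 0; b <= 3; ++b)
      assert(((a + b) & 0xFFF8) == 0);

  // hsub tests: the low 16 bits of both inputs are forced to all-ones, so
  // the low 16 bits of every pairwise difference are zero and truncation
  // to i16 always yields 0 (spot-checked over varying high bits).
  for (uint32_t hi = 0; hi < 256; ++hi) {
    uint32_t a = (hi << 16) | 0xFFFF;
    uint32_t b = ((255 - hi) << 16) | 0xFFFF;
    assert(uint16_t(a - b) == 0);
  }
  return 0;
}
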
diff --git a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
index 1056919488ba1..6376b4d599de7 100644
--- a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
+++ b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
@@ -4,14 +4,7 @@
 define <4 x i32> @hadd_select_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: hadd_select_v4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
-; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
-; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
@@ -29,7 +22,7 @@ define <8 x i8> @hadd_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -46,9 +39,8 @@ define <8 x i16> @hadd_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
@@ -66,7 +58,6 @@ define <16 x i8> @hadd_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
@@ -82,15 +73,7 @@ entry:
 define <4 x i32> @hsub_select_shl_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: hsub_select_shl_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpslld $16, %xmm0, %xmm1
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9]
-; CHECK-NEXT:    vpmaxud %xmm2, %xmm1, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
   %or2 = or <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -104,11 +87,7 @@ define <4 x i32> @hsub_select_shl_v4i32(<4 x i32> %x, <4 x i32> %y) {
 define <8 x i8> @hsub_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: hsub_trunc_v8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -121,14 +100,7 @@ entry:
 define <8 x i16> @hsub_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: hsub_trunc_v8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; CHECK-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@@ -141,14 +113,7 @@ entry:
 define <16 x i8> @hsub_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; CHECK-LABEL: hsub_trunc_v16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
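
The CodeGen diffs are the backend half of the same analysis. hadd_trunc_v8i16 and hadd_trunc_v8i32 swap the generic vpshufb/vpermq truncation lowering for a single pack, hadd_trunc_v16i16 drops the now-redundant vpand in front of vpackuswb, and the functions whose lanes are all known zero collapse to one vxorps. The pack substitution is sound because an unsigned-saturating pack coincides with plain truncation once every lane is known to fit the narrow type, which holds here since each summed lane is at most 6. A scalar spot check of that equivalence (plain C++, not the LLVM lowering):

#include <algorithm>
#include <cassert>
#include <cstdint>

// One lane of packuswb: signed i16 in, unsigned-saturated u8 out.
static uint8_t packuswb_lane(int16_t v) {
  return (uint8_t)std::clamp<int>(v, 0, 255);
}

int main() {
  // The inputs are masked with 3 before the phaddw, so each summed lane is
  // in [0, 6]; saturation never fires and the pack equals a truncate.
  for (int16_t a = 0; a <= 3; ++a)
    for (int16_t b = 0; b <= 3; ++b)
      assert(packuswb_lane(int16_t(a + b)) == uint8_t(a + b));
  return 0;
}
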
diff --git a/llvm/test/CodeGen/X86/pr53247.ll b/llvm/test/CodeGen/X86/pr53247.ll
index 2fc2ffb414e0e..cb5e699c8da5e 100644
--- a/llvm/test/CodeGen/X86/pr53247.ll
+++ b/llvm/test/CodeGen/X86/pr53247.ll
@@ -5,18 +5,12 @@
 define i32 @PR53247(){
 ; SSE-LABEL: PR53247:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pxor %xmm0, %xmm0
-; SSE-NEXT:    phaddd %xmm0, %xmm0
-; SSE-NEXT:    phaddd %xmm0, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    xorl %eax, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: PR53247:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    xorl %eax, %eax
 ; AVX-NEXT:    retq
 entry:
   %0 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer)
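
pr53247.ll is the motivating case: the function horizontally adds zero vectors twice and extracts one element, so once known bits propagate through phaddd the extracted value is a compile-time zero and both the SSE and AVX paths shrink to xorl %eax, %eax. A scalar model of the 128-bit phaddd lane layout (a sketch for illustration, not the LLVM implementation) shows why:

#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<uint32_t, 4>;

// Scalar phaddd: the low two result lanes are pairwise sums from the first
// operand, the high two from the second.
static V4 phaddd(const V4 &l, const V4 &r) {
  return {l[0] + l[1], l[2] + l[3], r[0] + r[1], r[2] + r[3]};
}

int main() {
  V4 z = {0, 0, 0, 0};
  V4 once = phaddd(z, z);
  V4 twice = phaddd(once, once);
  assert(twice[0] == 0); // the element PR53247 extracts and returns
  return 0;
}
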
diff --git a/llvm/unittests/Analysis/VectorUtilsTest.cpp b/llvm/unittests/Analysis/VectorUtilsTest.cpp
index 14958aa646a04..48b4d37558af1 100644
--- a/llvm/unittests/Analysis/VectorUtilsTest.cpp
+++ b/llvm/unittests/Analysis/VectorUtilsTest.cpp
@@ -242,6 +242,30 @@ TEST_F(BasicTest, getShuffleDemandedElts) {
   EXPECT_EQ(RHS.getZExtValue(), 0x9U);
 }
 
+TEST_F(BasicTest, getHorizontalDemandedEltsForFirstOperand) {
+  APInt LHS, RHS;
+
+  getHorizDemandedEltsForFirstOperand(128, APInt(4, 0b0000), LHS, RHS);
+  EXPECT_EQ(LHS.getZExtValue(), 0b0000U);
+  EXPECT_EQ(RHS.getZExtValue(), 0b0000U);
+
+  getHorizDemandedEltsForFirstOperand(128, APInt(4, 0b0001), LHS, RHS);
+  EXPECT_EQ(LHS.getZExtValue(), 0b0001U);
+  EXPECT_EQ(RHS.getZExtValue(), 0b0000U);
+
+  getHorizDemandedEltsForFirstOperand(128, APInt(4, 0b1000), LHS, RHS);
+  EXPECT_EQ(LHS.getZExtValue(), 0b0000U);
+  EXPECT_EQ(RHS.getZExtValue(), 0b0100U);
+
+  getHorizDemandedEltsForFirstOperand(128, APInt(4, 0b0110), LHS, RHS);
+  EXPECT_EQ(LHS.getZExtValue(), 0b0100U);
+  EXPECT_EQ(RHS.getZExtValue(), 0b0001U);
+
+  getHorizDemandedEltsForFirstOperand(256, APInt(4, 0b0100), LHS, RHS);
+  EXPECT_EQ(LHS.getZExtValue(), 0b0100U);
+  EXPECT_EQ(RHS.getZExtValue(), 0b0000U);
+}
+
 TEST_F(BasicTest, getSplatIndex) {
   EXPECT_EQ(getSplatIndex({0,0,0}), 0);
   EXPECT_EQ(getSplatIndex({1,0,0}), -1);     // no splat
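
The new unit test pins down the demanded-elements mapping the known-bits code relies on. A plain-C++ reconstruction of that mapping, inferred from the EXPECT_EQ values above rather than copied from VectorUtils.cpp: horizontal ops work in 128-bit lanes; within each lane the low half of the result reads element pairs from the first source operand and the high half from the second, and the returned masks flag the even (first) element of each demanded pair:

#include <cassert>
#include <cstdint>

static void horizDemandedEltsForFirstOperand(unsigned VectorBitWidth,
                                             unsigned NumElts,
                                             uint64_t Demanded,
                                             uint64_t &LHS, uint64_t &RHS) {
  LHS = RHS = 0;
  unsigned NumLanes = VectorBitWidth / 128;
  unsigned EltsPerLane = NumElts / NumLanes;
  unsigned HalfPerLane = EltsPerLane / 2;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    if (!(Demanded & (1ULL << Idx)))
      continue;
    unsigned LaneBase = (Idx / EltsPerLane) * EltsPerLane;
    unsigned Local = Idx % EltsPerLane;
    if (Local < HalfPerLane)
      LHS |= 1ULL << (LaneBase + 2 * Local);                 // pair in op 0
    else
      RHS |= 1ULL << (LaneBase + 2 * (Local - HalfPerLane)); // pair in op 1
  }
}

int main() { // mirrors two of the EXPECT_EQ cases above
  uint64_t L, R;
  horizDemandedEltsForFirstOperand(128, 4, 0b0110, L, R);
  assert(L == 0b0100 && R == 0b0001);
  horizDemandedEltsForFirstOperand(256, 4, 0b0100, L, R);
  assert(L == 0b0100 && R == 0);
  return 0;
}
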


