[llvm] [InstCombine] Canonicalize `abs(sub(ext(X),ext(Y)))` -> `ext(sub(max(X,Y),min(X,Y)))` (PR #162296)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 7 07:53:08 PDT 2025


https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/162296

This fold pushes the extension to after the abs. The new form generates identical scalar code, but is more profitable for vectorization because the abs is computed in the narrower source element type, which allows higher vectorization factors (VFs) to be selected and avoids expensive vector extends.

Proofs: https://alive2.llvm.org/ce/z/rChrWe, https://alive2.llvm.org/ce/z/D5E4bJ
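
To illustrate (a minimal sketch mirroring the `sext_i8` test added below; value names are illustrative), the signed scalar case:

  %ext.a = sext i8 %a to i32
  %ext.b = sext i8 %b to i32
  %sub = sub nsw i32 %ext.a, %ext.b
  %abs = call i32 @llvm.abs.i32(i32 %sub, i1 true)

becomes:

  %max = call i8 @llvm.smax.i8(i8 %a, i8 %b)
  %min = call i8 @llvm.smin.i8(i8 %a, i8 %b)
  %diff = sub i8 %max, %min
  %abs = zext i8 %diff to i32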

From 3a66e8867154e7aa4c6b743601121e8f52524247 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 7 Oct 2025 14:29:26 +0000
Subject: [PATCH] [InstCombine] Canonicalize `abs(sub(ext(X),ext(Y)))` ->
 `ext(sub(max(X,Y),min(X,Y)))`

This fold pushes the extension to after the abs. The new form generates
identical scalar code, but is more profitable for vectorization because
the abs is computed in the narrower source element type, which allows
higher vectorization factors (VFs) to be selected and avoids expensive
vector extends.

Proofs: https://alive2.llvm.org/ce/z/rChrWe, https://alive2.llvm.org/ce/z/D5E4bJ
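
The unsigned case is analogous (a sketch matching the `zext_i8` test
below; value names are illustrative):

  %ext.a = zext i8 %a to i32
  %ext.b = zext i8 %b to i32
  %sub = sub nsw i32 %ext.a, %ext.b
  %abs = call i32 @llvm.abs.i32(i32 %sub, i1 true)

becomes:

  %max = call i8 @llvm.umax.i8(i8 %a, i8 %b)
  %min = call i8 @llvm.umin.i8(i8 %a, i8 %b)
  %diff = sub i8 %max, %min
  %abs = zext i8 %diff to i32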
---
 .../InstCombine/InstCombineCalls.cpp          |  17 +
 .../Transforms/InstCombine/abs-of-extend.ll   | 104 +++++
 llvm/test/Transforms/InstCombine/icmp.ll      |   8 +-
 .../PhaseOrdering/AArch64/udotabd.ll          | 402 ++++++++----------
 4 files changed, 310 insertions(+), 221 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/abs-of-extend.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e1e24a99d0474..d5e78508d4ad7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1920,6 +1920,23 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     if (match(IIOperand, m_SRem(m_Value(X), m_APInt(C))) && *C == 2)
       return BinaryOperator::CreateAnd(X, ConstantInt::get(II->getType(), 1));
 
+    // abs(sub(sext(X), sext(Y))) -> zext(sub(smax(X, Y), smin(X, Y)))
+    bool AbsSExtDiff = match(
+        IIOperand, m_OneUse(m_Sub(m_SExt(m_Value(X)), m_SExt(m_Value(Y)))));
+    // abs(sub(zext(X), zext(Y))) -> zext(sub(umax(X, Y), umin(X, Y)))
+    bool AbsZExtDiff =
+        !AbsSExtDiff && match(IIOperand, m_OneUse(m_Sub(m_ZExt(m_Value(X)),
+                                                        m_ZExt(m_Value(Y)))));
+    if ((AbsSExtDiff || AbsZExtDiff) && X->getType() == Y->getType()) {
+      bool IsSigned = AbsSExtDiff;
+      Value *Max = Builder.CreateBinaryIntrinsic(
+          IsSigned ? Intrinsic::smax : Intrinsic::umax, X, Y);
+      Value *Min = Builder.CreateBinaryIntrinsic(
+          IsSigned ? Intrinsic::smin : Intrinsic::umin, X, Y);
+      Value *Sub = Builder.CreateSub(Max, Min);
+      return CastInst::Create(Instruction::ZExt, Sub, II->getType());
+    }
+
     break;
   }
   case Intrinsic::umin: {
diff --git a/llvm/test/Transforms/InstCombine/abs-of-extend.ll b/llvm/test/Transforms/InstCombine/abs-of-extend.ll
new file mode 100644
index 0000000000000..431055ec39dad
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/abs-of-extend.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; abs(sub(sext(X), sext(Y))) -> zext(sub(smax(X, Y), smin(X, Y)))
+; Proof: https://alive2.llvm.org/ce/z/D5E4bJ
+
+; abs(sub(zext(X), zext(Y))) -> zext(sub(umax(X, Y), umin(X, Y)))
+; Proof: https://alive2.llvm.org/ce/z/rChrWe
+
+define i32 @sext_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: define i32 @sext_i8(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[A]], i8 [[B]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.smin.i8(i8 [[A]], i8 [[B]])
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[ABS:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[ABS]]
+;
+  %ext.a = sext i8 %a to i32
+  %ext.b = sext i8 %b to i32
+  %sub = sub nsw i32 %ext.a, %ext.b
+  %abs = call i32 @llvm.abs(i32 %sub, i1 true)
+  ret i32 %abs
+}
+
+define i32 @zext_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: define i32 @zext_i8(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[A]], i8 [[B]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.umin.i8(i8 [[A]], i8 [[B]])
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[ABS:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[ABS]]
+;
+  %ext.a = zext i8 %a to i32
+  %ext.b = zext i8 %b to i32
+  %sub = sub nsw i32 %ext.a, %ext.b
+  %abs = call i32 @llvm.abs(i32 %sub, i1 true)
+  ret i32 %abs
+}
+
+define i64 @zext_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: define i64 @zext_i32(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.umax.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.umin.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[ABS:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    ret i64 [[ABS]]
+;
+  %ext.a = zext i32 %a to i64
+  %ext.b = zext i32 %b to i64
+  %sub = sub nsw i64 %ext.a, %ext.b
+  %abs = call i64 @llvm.abs(i64 %sub, i1 true)
+  ret i64 %abs
+}
+
+define <16 x i32> @vec_source(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: define <16 x i32> @vec_source(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[ABS:%.*]] = zext <16 x i8> [[TMP3]] to <16 x i32>
+; CHECK-NEXT:    ret <16 x i32> [[ABS]]
+;
+  %ext.a = sext <16 x i8> %a to <16 x i32>
+  %ext.b = sext <16 x i8> %b to <16 x i32>
+  %sub = sub nsw <16 x i32> %ext.a, %ext.b
+  %abs = call <16 x i32> @llvm.abs(<16 x i32> %sub, i1 true)
+  ret <16 x i32> %abs
+}
+
+define i32 @mixed_extend(i8 %a, i8 %b) {
+; CHECK-LABEL: define i32 @mixed_extend(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[EXT_A:%.*]] = sext i8 [[A]] to i32
+; CHECK-NEXT:    [[EXT_B:%.*]] = zext i8 [[B]] to i32
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[EXT_A]], [[EXT_B]]
+; CHECK-NEXT:    [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 true)
+; CHECK-NEXT:    ret i32 [[ABS]]
+;
+  %ext.a = sext i8 %a to i32
+  %ext.b = zext i8 %b to i32
+  %sub = sub nsw i32 %ext.a, %ext.b
+  %abs = call i32 @llvm.abs(i32 %sub, i1 true)
+  ret i32 %abs
+}
+
+define i32 @mixed_source_types(i16 %a, i8 %b) {
+; CHECK-LABEL: define i32 @mixed_source_types(
+; CHECK-SAME: i16 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[EXT_A:%.*]] = zext i16 [[A]] to i32
+; CHECK-NEXT:    [[EXT_B:%.*]] = zext i8 [[B]] to i32
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[EXT_A]], [[EXT_B]]
+; CHECK-NEXT:    [[ABS:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 true)
+; CHECK-NEXT:    ret i32 [[ABS]]
+;
+  %ext.a = zext i16 %a to i32
+  %ext.b = zext i8 %b to i32
+  %sub = sub nsw i32 %ext.a, %ext.b
+  %abs = call i32 @llvm.abs(i32 %sub, i1 true)
+  ret i32 %abs
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 696208b903798..ee482d6698457 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -4065,10 +4065,10 @@ define <2 x i1> @f4_vec(<2 x i64> %a, <2 x i64> %b) {
 define i32 @f5(i8 %a, i8 %b) {
 ; CHECK-LABEL: define i32 @f5(
 ; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[A]] to i32
-; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[B]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; CHECK-NEXT:    [[SUB7_SUB:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[A]], i8 [[B]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.umin.i8(i8 [[A]], i8 [[B]])
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[SUB7_SUB:%.*]] = zext i8 [[TMP3]] to i32
 ; CHECK-NEXT:    ret i32 [[SUB7_SUB]]
 ;
   %conv = zext i8 %a to i32
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
index 4c7e39d31b5c6..7ae07a5b967ff 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
@@ -12,176 +12,160 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2,
 ; CHECK-O3-NEXT:    [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
 ; CHECK-O3-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
 ; CHECK-O3-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]]
-; CHECK-O3-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]]
-; CHECK-O3-NEXT:    [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 false)
-; CHECK-O3-NEXT:    [[TMP6:%.*]] = zext <16 x i16> [[TMP5]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+; CHECK-O3-NEXT:    [[TMP3:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+; CHECK-O3-NEXT:    [[TMP4:%.*]] = sub <16 x i8> [[TMP2]], [[TMP3]]
+; CHECK-O3-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP7:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]])
 ; CHECK-O3-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP12:%.*]] = sub nsw <16 x i16> [[TMP9]], [[TMP11]]
-; CHECK-O3-NEXT:    [[TMP13:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP12]], i1 false)
-; CHECK-O3-NEXT:    [[TMP14:%.*]] = zext <16 x i16> [[TMP13]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP9:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP12]], <16 x i8> [[TMP8]])
+; CHECK-O3-NEXT:    [[TMP10:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP12]], <16 x i8> [[TMP8]])
+; CHECK-O3-NEXT:    [[TMP11:%.*]] = sub <16 x i8> [[TMP9]], [[TMP10]]
+; CHECK-O3-NEXT:    [[TMP14:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP15:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]])
 ; CHECK-O3-NEXT:    [[OP_RDX_1:%.*]] = add i32 [[TMP15]], [[TMP7]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP16:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP20:%.*]] = sub nsw <16 x i16> [[TMP17]], [[TMP19]]
-; CHECK-O3-NEXT:    [[TMP21:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP20]], i1 false)
-; CHECK-O3-NEXT:    [[TMP22:%.*]] = zext <16 x i16> [[TMP21]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP19:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP20:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP16:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP19]], <16 x i8> [[TMP20]])
+; CHECK-O3-NEXT:    [[TMP17:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP19]], <16 x i8> [[TMP20]])
+; CHECK-O3-NEXT:    [[TMP18:%.*]] = sub <16 x i8> [[TMP16]], [[TMP17]]
+; CHECK-O3-NEXT:    [[TMP22:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP23:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP22]])
 ; CHECK-O3-NEXT:    [[OP_RDX_2:%.*]] = add i32 [[TMP23]], [[OP_RDX_1]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]]
-; CHECK-O3-NEXT:    [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 false)
-; CHECK-O3-NEXT:    [[TMP30:%.*]] = zext <16 x i16> [[TMP29]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP27:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP21]], <16 x i8> [[TMP26]])
+; CHECK-O3-NEXT:    [[TMP24:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP21]], <16 x i8> [[TMP26]])
+; CHECK-O3-NEXT:    [[TMP25:%.*]] = sub <16 x i8> [[TMP27]], [[TMP24]]
+; CHECK-O3-NEXT:    [[TMP30:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP31:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP30]])
 ; CHECK-O3-NEXT:    [[OP_RDX_3:%.*]] = add i32 [[TMP31]], [[OP_RDX_2]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP34:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP35:%.*]] = zext <16 x i8> [[TMP34]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP36:%.*]] = sub nsw <16 x i16> [[TMP33]], [[TMP35]]
-; CHECK-O3-NEXT:    [[TMP37:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP36]], i1 false)
-; CHECK-O3-NEXT:    [[TMP38:%.*]] = zext <16 x i16> [[TMP37]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP28:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP33:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP28]], <16 x i8> [[TMP29]])
+; CHECK-O3-NEXT:    [[TMP34:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP28]], <16 x i8> [[TMP29]])
+; CHECK-O3-NEXT:    [[TMP32:%.*]] = sub <16 x i8> [[TMP33]], [[TMP34]]
+; CHECK-O3-NEXT:    [[TMP38:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP39:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP38]])
 ; CHECK-O3-NEXT:    [[OP_RDX_4:%.*]] = add i32 [[TMP39]], [[OP_RDX_3]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP40:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP41:%.*]] = zext <16 x i8> [[TMP40]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP42:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP44:%.*]] = sub nsw <16 x i16> [[TMP41]], [[TMP43]]
-; CHECK-O3-NEXT:    [[TMP45:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP44]], i1 false)
-; CHECK-O3-NEXT:    [[TMP46:%.*]] = zext <16 x i16> [[TMP45]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP35:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP36:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP37:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP35]], <16 x i8> [[TMP36]])
+; CHECK-O3-NEXT:    [[TMP40:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP35]], <16 x i8> [[TMP36]])
+; CHECK-O3-NEXT:    [[TMP41:%.*]] = sub <16 x i8> [[TMP37]], [[TMP40]]
+; CHECK-O3-NEXT:    [[TMP46:%.*]] = zext <16 x i8> [[TMP41]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP47:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP46]])
 ; CHECK-O3-NEXT:    [[OP_RDX_5:%.*]] = add i32 [[TMP47]], [[OP_RDX_4]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP48:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP50:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP51:%.*]] = zext <16 x i8> [[TMP50]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP52:%.*]] = sub nsw <16 x i16> [[TMP49]], [[TMP51]]
-; CHECK-O3-NEXT:    [[TMP53:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP52]], i1 false)
-; CHECK-O3-NEXT:    [[TMP54:%.*]] = zext <16 x i16> [[TMP53]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP42:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP43:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP44:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP42]], <16 x i8> [[TMP43]])
+; CHECK-O3-NEXT:    [[TMP45:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP42]], <16 x i8> [[TMP43]])
+; CHECK-O3-NEXT:    [[TMP48:%.*]] = sub <16 x i8> [[TMP44]], [[TMP45]]
+; CHECK-O3-NEXT:    [[TMP54:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP55:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP54]])
 ; CHECK-O3-NEXT:    [[OP_RDX_6:%.*]] = add i32 [[TMP55]], [[OP_RDX_5]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP56:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP57:%.*]] = zext <16 x i8> [[TMP56]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP58:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP59:%.*]] = zext <16 x i8> [[TMP58]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP60:%.*]] = sub nsw <16 x i16> [[TMP57]], [[TMP59]]
-; CHECK-O3-NEXT:    [[TMP61:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP60]], i1 false)
-; CHECK-O3-NEXT:    [[TMP62:%.*]] = zext <16 x i16> [[TMP61]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP49:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP50:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP51:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP49]], <16 x i8> [[TMP50]])
+; CHECK-O3-NEXT:    [[TMP52:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP49]], <16 x i8> [[TMP50]])
+; CHECK-O3-NEXT:    [[TMP53:%.*]] = sub <16 x i8> [[TMP51]], [[TMP52]]
+; CHECK-O3-NEXT:    [[TMP62:%.*]] = zext <16 x i8> [[TMP53]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP63:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP62]])
 ; CHECK-O3-NEXT:    [[OP_RDX_7:%.*]] = add i32 [[TMP63]], [[OP_RDX_6]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP64:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP65:%.*]] = zext <16 x i8> [[TMP64]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP66:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP67:%.*]] = zext <16 x i8> [[TMP66]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP68:%.*]] = sub nsw <16 x i16> [[TMP65]], [[TMP67]]
-; CHECK-O3-NEXT:    [[TMP69:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP68]], i1 false)
-; CHECK-O3-NEXT:    [[TMP70:%.*]] = zext <16 x i16> [[TMP69]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP56:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP57:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP58:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP56]], <16 x i8> [[TMP57]])
+; CHECK-O3-NEXT:    [[TMP59:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP56]], <16 x i8> [[TMP57]])
+; CHECK-O3-NEXT:    [[TMP60:%.*]] = sub <16 x i8> [[TMP58]], [[TMP59]]
+; CHECK-O3-NEXT:    [[TMP70:%.*]] = zext <16 x i8> [[TMP60]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP71:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP70]])
 ; CHECK-O3-NEXT:    [[OP_RDX_8:%.*]] = add i32 [[TMP71]], [[OP_RDX_7]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP72:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP73:%.*]] = zext <16 x i8> [[TMP72]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP74:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP75:%.*]] = zext <16 x i8> [[TMP74]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP76:%.*]] = sub nsw <16 x i16> [[TMP73]], [[TMP75]]
-; CHECK-O3-NEXT:    [[TMP77:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP76]], i1 false)
-; CHECK-O3-NEXT:    [[TMP78:%.*]] = zext <16 x i16> [[TMP77]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP68:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP64:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP65:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP68]], <16 x i8> [[TMP64]])
+; CHECK-O3-NEXT:    [[TMP66:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP68]], <16 x i8> [[TMP64]])
+; CHECK-O3-NEXT:    [[TMP67:%.*]] = sub <16 x i8> [[TMP65]], [[TMP66]]
+; CHECK-O3-NEXT:    [[TMP78:%.*]] = zext <16 x i8> [[TMP67]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP79:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP78]])
 ; CHECK-O3-NEXT:    [[OP_RDX_9:%.*]] = add i32 [[TMP79]], [[OP_RDX_8]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP80:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP81:%.*]] = zext <16 x i8> [[TMP80]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP82:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP83:%.*]] = zext <16 x i8> [[TMP82]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP84:%.*]] = sub nsw <16 x i16> [[TMP81]], [[TMP83]]
-; CHECK-O3-NEXT:    [[TMP85:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP84]], i1 false)
-; CHECK-O3-NEXT:    [[TMP86:%.*]] = zext <16 x i16> [[TMP85]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP75:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP76:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP72:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP75]], <16 x i8> [[TMP76]])
+; CHECK-O3-NEXT:    [[TMP73:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP75]], <16 x i8> [[TMP76]])
+; CHECK-O3-NEXT:    [[TMP74:%.*]] = sub <16 x i8> [[TMP72]], [[TMP73]]
+; CHECK-O3-NEXT:    [[TMP86:%.*]] = zext <16 x i8> [[TMP74]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP87:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP86]])
 ; CHECK-O3-NEXT:    [[OP_RDX_10:%.*]] = add i32 [[TMP87]], [[OP_RDX_9]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP88:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP89:%.*]] = zext <16 x i8> [[TMP88]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP90:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP91:%.*]] = zext <16 x i8> [[TMP90]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP92:%.*]] = sub nsw <16 x i16> [[TMP89]], [[TMP91]]
-; CHECK-O3-NEXT:    [[TMP93:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP92]], i1 false)
-; CHECK-O3-NEXT:    [[TMP94:%.*]] = zext <16 x i16> [[TMP93]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP77:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP82:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP83:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP77]], <16 x i8> [[TMP82]])
+; CHECK-O3-NEXT:    [[TMP80:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP77]], <16 x i8> [[TMP82]])
+; CHECK-O3-NEXT:    [[TMP81:%.*]] = sub <16 x i8> [[TMP83]], [[TMP80]]
+; CHECK-O3-NEXT:    [[TMP94:%.*]] = zext <16 x i8> [[TMP81]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP95:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP94]])
 ; CHECK-O3-NEXT:    [[OP_RDX_11:%.*]] = add i32 [[TMP95]], [[OP_RDX_10]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP96:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP97:%.*]] = zext <16 x i8> [[TMP96]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP98:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP99:%.*]] = zext <16 x i8> [[TMP98]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP100:%.*]] = sub nsw <16 x i16> [[TMP97]], [[TMP99]]
-; CHECK-O3-NEXT:    [[TMP101:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP100]], i1 false)
-; CHECK-O3-NEXT:    [[TMP102:%.*]] = zext <16 x i16> [[TMP101]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP84:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP85:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP89:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP84]], <16 x i8> [[TMP85]])
+; CHECK-O3-NEXT:    [[TMP90:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP84]], <16 x i8> [[TMP85]])
+; CHECK-O3-NEXT:    [[TMP88:%.*]] = sub <16 x i8> [[TMP89]], [[TMP90]]
+; CHECK-O3-NEXT:    [[TMP102:%.*]] = zext <16 x i8> [[TMP88]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP103:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP102]])
 ; CHECK-O3-NEXT:    [[OP_RDX_12:%.*]] = add i32 [[TMP103]], [[OP_RDX_11]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP104:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP105:%.*]] = zext <16 x i8> [[TMP104]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP106:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP107:%.*]] = zext <16 x i8> [[TMP106]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP108:%.*]] = sub nsw <16 x i16> [[TMP105]], [[TMP107]]
-; CHECK-O3-NEXT:    [[TMP109:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP108]], i1 false)
-; CHECK-O3-NEXT:    [[TMP110:%.*]] = zext <16 x i16> [[TMP109]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP91:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP92:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP93:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP91]], <16 x i8> [[TMP92]])
+; CHECK-O3-NEXT:    [[TMP96:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP91]], <16 x i8> [[TMP92]])
+; CHECK-O3-NEXT:    [[TMP97:%.*]] = sub <16 x i8> [[TMP93]], [[TMP96]]
+; CHECK-O3-NEXT:    [[TMP110:%.*]] = zext <16 x i8> [[TMP97]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP111:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP110]])
 ; CHECK-O3-NEXT:    [[OP_RDX_13:%.*]] = add i32 [[TMP111]], [[OP_RDX_12]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP112:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP113:%.*]] = zext <16 x i8> [[TMP112]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP114:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP115:%.*]] = zext <16 x i8> [[TMP114]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP116:%.*]] = sub nsw <16 x i16> [[TMP113]], [[TMP115]]
-; CHECK-O3-NEXT:    [[TMP117:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP116]], i1 false)
-; CHECK-O3-NEXT:    [[TMP118:%.*]] = zext <16 x i16> [[TMP117]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP98:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP99:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP100:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP98]], <16 x i8> [[TMP99]])
+; CHECK-O3-NEXT:    [[TMP101:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP98]], <16 x i8> [[TMP99]])
+; CHECK-O3-NEXT:    [[TMP104:%.*]] = sub <16 x i8> [[TMP100]], [[TMP101]]
+; CHECK-O3-NEXT:    [[TMP118:%.*]] = zext <16 x i8> [[TMP104]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP118]])
 ; CHECK-O3-NEXT:    [[OP_RDX_14:%.*]] = add i32 [[TMP119]], [[OP_RDX_13]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]]
 ; CHECK-O3-NEXT:    [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP120:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP121:%.*]] = zext <16 x i8> [[TMP120]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP122:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP123:%.*]] = zext <16 x i8> [[TMP122]] to <16 x i16>
-; CHECK-O3-NEXT:    [[TMP124:%.*]] = sub nsw <16 x i16> [[TMP121]], [[TMP123]]
-; CHECK-O3-NEXT:    [[TMP125:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP124]], i1 false)
-; CHECK-O3-NEXT:    [[TMP126:%.*]] = zext <16 x i16> [[TMP125]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP105:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP106:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP107:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP105]], <16 x i8> [[TMP106]])
+; CHECK-O3-NEXT:    [[TMP108:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP105]], <16 x i8> [[TMP106]])
+; CHECK-O3-NEXT:    [[TMP109:%.*]] = sub <16 x i8> [[TMP107]], [[TMP108]]
+; CHECK-O3-NEXT:    [[TMP126:%.*]] = zext <16 x i8> [[TMP109]] to <16 x i32>
 ; CHECK-O3-NEXT:    [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP126]])
 ; CHECK-O3-NEXT:    [[OP_RDX_15:%.*]] = add i32 [[TMP127]], [[OP_RDX_14]]
 ; CHECK-O3-NEXT:    ret i32 [[OP_RDX_15]]
@@ -192,176 +176,160 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2,
 ; CHECK-LTO-NEXT:    [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
 ; CHECK-LTO-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
 ; CHECK-LTO-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]]
-; CHECK-LTO-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]]
-; CHECK-LTO-NEXT:    [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP36:%.*]] = zext nneg <16 x i16> [[TMP5]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+; CHECK-LTO-NEXT:    [[TMP3:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+; CHECK-LTO-NEXT:    [[TMP4:%.*]] = sub <16 x i8> [[TMP2]], [[TMP3]]
+; CHECK-LTO-NEXT:    [[TMP36:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP44:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP36]])
 ; CHECK-LTO-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP10:%.*]] = sub nsw <16 x i16> [[TMP7]], [[TMP9]]
-; CHECK-LTO-NEXT:    [[TMP11:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP10]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP52:%.*]] = zext nneg <16 x i16> [[TMP11]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP9:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; CHECK-LTO-NEXT:    [[TMP10:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; CHECK-LTO-NEXT:    [[TMP11:%.*]] = sub <16 x i8> [[TMP9]], [[TMP10]]
+; CHECK-LTO-NEXT:    [[TMP52:%.*]] = zext <16 x i8> [[TMP11]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP60:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP52]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_1:%.*]] = add i32 [[TMP60]], [[TMP44]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP14:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP16:%.*]] = sub nsw <16 x i16> [[TMP13]], [[TMP15]]
-; CHECK-LTO-NEXT:    [[TMP17:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP16]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP68:%.*]] = zext nneg <16 x i16> [[TMP17]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP14:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP15:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP16:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP14]], <16 x i8> [[TMP15]])
+; CHECK-LTO-NEXT:    [[TMP17:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP14]], <16 x i8> [[TMP15]])
+; CHECK-LTO-NEXT:    [[TMP18:%.*]] = sub <16 x i8> [[TMP16]], [[TMP17]]
+; CHECK-LTO-NEXT:    [[TMP68:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP76:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP68]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_2:%.*]] = add i32 [[OP_RDX_1]], [[TMP76]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP20:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP22:%.*]] = sub nsw <16 x i16> [[TMP19]], [[TMP21]]
-; CHECK-LTO-NEXT:    [[TMP23:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP22]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP84:%.*]] = zext nneg <16 x i16> [[TMP23]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP22:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP23:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP21]], <16 x i8> [[TMP22]])
+; CHECK-LTO-NEXT:    [[TMP24:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP21]], <16 x i8> [[TMP22]])
+; CHECK-LTO-NEXT:    [[TMP25:%.*]] = sub <16 x i8> [[TMP23]], [[TMP24]]
+; CHECK-LTO-NEXT:    [[TMP84:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP92:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP84]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_3:%.*]] = add i32 [[OP_RDX_2]], [[TMP92]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]]
-; CHECK-LTO-NEXT:    [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP100:%.*]] = zext nneg <16 x i16> [[TMP29]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP28:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP30:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP28]], <16 x i8> [[TMP29]])
+; CHECK-LTO-NEXT:    [[TMP31:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP28]], <16 x i8> [[TMP29]])
+; CHECK-LTO-NEXT:    [[TMP32:%.*]] = sub <16 x i8> [[TMP30]], [[TMP31]]
+; CHECK-LTO-NEXT:    [[TMP100:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP108:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP100]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_4:%.*]] = add i32 [[OP_RDX_3]], [[TMP108]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP30:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP34:%.*]] = sub nsw <16 x i16> [[TMP31]], [[TMP33]]
-; CHECK-LTO-NEXT:    [[TMP35:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP34]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP116:%.*]] = zext nneg <16 x i16> [[TMP35]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP35:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP40:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP37:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP35]], <16 x i8> [[TMP40]])
+; CHECK-LTO-NEXT:    [[TMP38:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP35]], <16 x i8> [[TMP40]])
+; CHECK-LTO-NEXT:    [[TMP39:%.*]] = sub <16 x i8> [[TMP37]], [[TMP38]]
+; CHECK-LTO-NEXT:    [[TMP116:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP117:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP116]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_5:%.*]] = add i32 [[OP_RDX_4]], [[TMP117]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP37:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP38:%.*]] = zext <16 x i8> [[TMP37]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP39:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i16> [[TMP38]], [[TMP40]]
-; CHECK-LTO-NEXT:    [[TMP42:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP41]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP43:%.*]] = zext nneg <16 x i16> [[TMP42]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP42:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP47:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP48:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP42]], <16 x i8> [[TMP47]])
+; CHECK-LTO-NEXT:    [[TMP45:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP42]], <16 x i8> [[TMP47]])
+; CHECK-LTO-NEXT:    [[TMP46:%.*]] = sub <16 x i8> [[TMP48]], [[TMP45]]
+; CHECK-LTO-NEXT:    [[TMP43:%.*]] = zext <16 x i8> [[TMP46]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP118:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP43]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_6:%.*]] = add i32 [[OP_RDX_5]], [[TMP118]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP45:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP46:%.*]] = zext <16 x i8> [[TMP45]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP47:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i16> [[TMP46]], [[TMP48]]
-; CHECK-LTO-NEXT:    [[TMP50:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP49]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP51:%.*]] = zext nneg <16 x i16> [[TMP50]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP49:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP50:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP54:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP49]], <16 x i8> [[TMP50]])
+; CHECK-LTO-NEXT:    [[TMP55:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP49]], <16 x i8> [[TMP50]])
+; CHECK-LTO-NEXT:    [[TMP53:%.*]] = sub <16 x i8> [[TMP54]], [[TMP55]]
+; CHECK-LTO-NEXT:    [[TMP51:%.*]] = zext <16 x i8> [[TMP53]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP120:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP51]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_7:%.*]] = add i32 [[OP_RDX_6]], [[TMP120]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP53:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP54:%.*]] = zext <16 x i8> [[TMP53]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP55:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP57:%.*]] = sub nsw <16 x i16> [[TMP54]], [[TMP56]]
-; CHECK-LTO-NEXT:    [[TMP58:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP57]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP59:%.*]] = zext nneg <16 x i16> [[TMP58]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP56:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP57:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP58:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP56]], <16 x i8> [[TMP57]])
+; CHECK-LTO-NEXT:    [[TMP61:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP56]], <16 x i8> [[TMP57]])
+; CHECK-LTO-NEXT:    [[TMP62:%.*]] = sub <16 x i8> [[TMP58]], [[TMP61]]
+; CHECK-LTO-NEXT:    [[TMP59:%.*]] = zext <16 x i8> [[TMP62]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP121:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP59]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_8:%.*]] = add i32 [[OP_RDX_7]], [[TMP121]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP61:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP62:%.*]] = zext <16 x i8> [[TMP61]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP63:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP64:%.*]] = zext <16 x i8> [[TMP63]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP65:%.*]] = sub nsw <16 x i16> [[TMP62]], [[TMP64]]
-; CHECK-LTO-NEXT:    [[TMP66:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP65]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP67:%.*]] = zext nneg <16 x i16> [[TMP66]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP63:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP64:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP65:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP63]], <16 x i8> [[TMP64]])
+; CHECK-LTO-NEXT:    [[TMP66:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP63]], <16 x i8> [[TMP64]])
+; CHECK-LTO-NEXT:    [[TMP69:%.*]] = sub <16 x i8> [[TMP65]], [[TMP66]]
+; CHECK-LTO-NEXT:    [[TMP67:%.*]] = zext <16 x i8> [[TMP69]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP122:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP67]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_9:%.*]] = add i32 [[OP_RDX_8]], [[TMP122]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP69:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP70:%.*]] = zext <16 x i8> [[TMP69]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP70:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP71:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP72:%.*]] = zext <16 x i8> [[TMP71]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP73:%.*]] = sub nsw <16 x i16> [[TMP70]], [[TMP72]]
-; CHECK-LTO-NEXT:    [[TMP74:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP73]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP75:%.*]] = zext nneg <16 x i16> [[TMP74]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP72:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP70]], <16 x i8> [[TMP71]])
+; CHECK-LTO-NEXT:    [[TMP73:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP70]], <16 x i8> [[TMP71]])
+; CHECK-LTO-NEXT:    [[TMP74:%.*]] = sub <16 x i8> [[TMP72]], [[TMP73]]
+; CHECK-LTO-NEXT:    [[TMP75:%.*]] = zext <16 x i8> [[TMP74]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP123:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP75]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_10:%.*]] = add i32 [[OP_RDX_9]], [[TMP123]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP77:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP78:%.*]] = zext <16 x i8> [[TMP77]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP79:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP80:%.*]] = zext <16 x i8> [[TMP79]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP81:%.*]] = sub nsw <16 x i16> [[TMP78]], [[TMP80]]
-; CHECK-LTO-NEXT:    [[TMP82:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP81]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP83:%.*]] = zext nneg <16 x i16> [[TMP82]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP78:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP79:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP77]], <16 x i8> [[TMP78]])
+; CHECK-LTO-NEXT:    [[TMP80:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP77]], <16 x i8> [[TMP78]])
+; CHECK-LTO-NEXT:    [[TMP81:%.*]] = sub <16 x i8> [[TMP79]], [[TMP80]]
+; CHECK-LTO-NEXT:    [[TMP83:%.*]] = zext <16 x i8> [[TMP81]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP124:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP83]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_11:%.*]] = add i32 [[OP_RDX_10]], [[TMP124]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP85:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP86:%.*]] = zext <16 x i8> [[TMP85]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP87:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP88:%.*]] = zext <16 x i8> [[TMP87]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP89:%.*]] = sub nsw <16 x i16> [[TMP86]], [[TMP88]]
-; CHECK-LTO-NEXT:    [[TMP90:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP89]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP91:%.*]] = zext nneg <16 x i16> [[TMP90]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP89:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP85:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP86:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP89]], <16 x i8> [[TMP85]])
+; CHECK-LTO-NEXT:    [[TMP87:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP89]], <16 x i8> [[TMP85]])
+; CHECK-LTO-NEXT:    [[TMP88:%.*]] = sub <16 x i8> [[TMP86]], [[TMP87]]
+; CHECK-LTO-NEXT:    [[TMP91:%.*]] = zext <16 x i8> [[TMP88]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP125:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP91]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_12:%.*]] = add i32 [[OP_RDX_11]], [[TMP125]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP93:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP94:%.*]] = zext <16 x i8> [[TMP93]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP95:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP96:%.*]] = zext <16 x i8> [[TMP95]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP97:%.*]] = sub nsw <16 x i16> [[TMP94]], [[TMP96]]
-; CHECK-LTO-NEXT:    [[TMP98:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP97]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP99:%.*]] = zext nneg <16 x i16> [[TMP98]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP96:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP97:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP93:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP96]], <16 x i8> [[TMP97]])
+; CHECK-LTO-NEXT:    [[TMP94:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP96]], <16 x i8> [[TMP97]])
+; CHECK-LTO-NEXT:    [[TMP95:%.*]] = sub <16 x i8> [[TMP93]], [[TMP94]]
+; CHECK-LTO-NEXT:    [[TMP99:%.*]] = zext <16 x i8> [[TMP95]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP126:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP99]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_13:%.*]] = add i32 [[OP_RDX_12]], [[TMP126]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP101:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP102:%.*]] = zext <16 x i8> [[TMP101]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP98:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP103:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP104:%.*]] = zext <16 x i8> [[TMP103]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP105:%.*]] = sub nsw <16 x i16> [[TMP102]], [[TMP104]]
-; CHECK-LTO-NEXT:    [[TMP106:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP105]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP107:%.*]] = zext nneg <16 x i16> [[TMP106]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP104:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP98]], <16 x i8> [[TMP103]])
+; CHECK-LTO-NEXT:    [[TMP101:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP98]], <16 x i8> [[TMP103]])
+; CHECK-LTO-NEXT:    [[TMP102:%.*]] = sub <16 x i8> [[TMP104]], [[TMP101]]
+; CHECK-LTO-NEXT:    [[TMP107:%.*]] = zext <16 x i8> [[TMP102]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP107]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_14:%.*]] = add i32 [[OP_RDX_13]], [[TMP119]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
-; CHECK-LTO-NEXT:    [[TMP109:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP110:%.*]] = zext <16 x i8> [[TMP109]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP111:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-LTO-NEXT:    [[TMP112:%.*]] = zext <16 x i8> [[TMP111]] to <16 x i16>
-; CHECK-LTO-NEXT:    [[TMP113:%.*]] = sub nsw <16 x i16> [[TMP110]], [[TMP112]]
-; CHECK-LTO-NEXT:    [[TMP114:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP113]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP115:%.*]] = zext nneg <16 x i16> [[TMP114]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP105:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP106:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP110:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP105]], <16 x i8> [[TMP106]])
+; CHECK-LTO-NEXT:    [[TMP111:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP105]], <16 x i8> [[TMP106]])
+; CHECK-LTO-NEXT:    [[TMP109:%.*]] = sub <16 x i8> [[TMP110]], [[TMP111]]
+; CHECK-LTO-NEXT:    [[TMP115:%.*]] = zext <16 x i8> [[TMP109]] to <16 x i32>
 ; CHECK-LTO-NEXT:    [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP115]])
 ; CHECK-LTO-NEXT:    [[OP_RDX_15:%.*]] = add i32 [[OP_RDX_14]], [[TMP127]]
 ; CHECK-LTO-NEXT:    ret i32 [[OP_RDX_15]]


