[llvm] 3ef5348 - [AArch64] Add a phase-order test for dot patterns. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 11 13:20:11 PST 2025


Author: David Green
Date: 2025-02-11T21:20:07Z
New Revision: 3ef5348a0486def3e53ce37b9ed8b8ca020a3a12

URL: https://github.com/llvm/llvm-project/commit/3ef5348a0486def3e53ce37b9ed8b8ca020a3a12
DIFF: https://github.com/llvm/llvm-project/commit/3ef5348a0486def3e53ce37b9ed8b8ca020a3a12.diff

LOG: [AArch64] Add a phase-order test for dot patterns. NFC

Added: 
    llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
new file mode 100644
index 0000000000000..3496520c232aa
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
@@ -0,0 +1,499 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -O3 < %s | FileCheck %s --check-prefixes=CHECK-O3
+; RUN: opt -S -passes="default<O3>,default<O3>" < %s | FileCheck %s --check-prefixes=CHECK-LTO
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, i32 noundef %s_p2) #0 {
+; CHECK-O3-LABEL: define dso_local i32 @test(
+; CHECK-O3-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[S_P1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[S_P2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-O3-NEXT:  [[ENTRY:.*:]]
+; CHECK-O3-NEXT:    [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
+; CHECK-O3-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
+; CHECK-O3-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-O3-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]]
+; CHECK-O3-NEXT:    [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 false)
+; CHECK-O3-NEXT:    [[TMP6:%.*]] = zext <16 x i16> [[TMP5]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP7:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]])
+; CHECK-O3-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP12:%.*]] = sub nsw <16 x i16> [[TMP9]], [[TMP11]]
+; CHECK-O3-NEXT:    [[TMP13:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP12]], i1 false)
+; CHECK-O3-NEXT:    [[TMP14:%.*]] = zext <16 x i16> [[TMP13]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP15:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]])
+; CHECK-O3-NEXT:    [[OP_RDX_1:%.*]] = add i32 [[TMP15]], [[TMP7]]
+; CHECK-O3-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP16:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP20:%.*]] = sub nsw <16 x i16> [[TMP17]], [[TMP19]]
+; CHECK-O3-NEXT:    [[TMP21:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP20]], i1 false)
+; CHECK-O3-NEXT:    [[TMP22:%.*]] = zext <16 x i16> [[TMP21]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP23:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP22]])
+; CHECK-O3-NEXT:    [[OP_RDX_2:%.*]] = add i32 [[TMP23]], [[OP_RDX_1]]
+; CHECK-O3-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]]
+; CHECK-O3-NEXT:    [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 false)
+; CHECK-O3-NEXT:    [[TMP30:%.*]] = zext <16 x i16> [[TMP29]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP31:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP30]])
+; CHECK-O3-NEXT:    [[OP_RDX_3:%.*]] = add i32 [[TMP31]], [[OP_RDX_2]]
+; CHECK-O3-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP34:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP35:%.*]] = zext <16 x i8> [[TMP34]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP36:%.*]] = sub nsw <16 x i16> [[TMP33]], [[TMP35]]
+; CHECK-O3-NEXT:    [[TMP37:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP36]], i1 false)
+; CHECK-O3-NEXT:    [[TMP38:%.*]] = zext <16 x i16> [[TMP37]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP39:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP38]])
+; CHECK-O3-NEXT:    [[OP_RDX_4:%.*]] = add i32 [[TMP39]], [[OP_RDX_3]]
+; CHECK-O3-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP40:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP41:%.*]] = zext <16 x i8> [[TMP40]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP42:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP44:%.*]] = sub nsw <16 x i16> [[TMP41]], [[TMP43]]
+; CHECK-O3-NEXT:    [[TMP45:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP44]], i1 false)
+; CHECK-O3-NEXT:    [[TMP46:%.*]] = zext <16 x i16> [[TMP45]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP47:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP46]])
+; CHECK-O3-NEXT:    [[OP_RDX_5:%.*]] = add i32 [[TMP47]], [[OP_RDX_4]]
+; CHECK-O3-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP48:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP50:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP51:%.*]] = zext <16 x i8> [[TMP50]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP52:%.*]] = sub nsw <16 x i16> [[TMP49]], [[TMP51]]
+; CHECK-O3-NEXT:    [[TMP53:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP52]], i1 false)
+; CHECK-O3-NEXT:    [[TMP54:%.*]] = zext <16 x i16> [[TMP53]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP55:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP54]])
+; CHECK-O3-NEXT:    [[OP_RDX_6:%.*]] = add i32 [[TMP55]], [[OP_RDX_5]]
+; CHECK-O3-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP56:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP57:%.*]] = zext <16 x i8> [[TMP56]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP58:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP59:%.*]] = zext <16 x i8> [[TMP58]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP60:%.*]] = sub nsw <16 x i16> [[TMP57]], [[TMP59]]
+; CHECK-O3-NEXT:    [[TMP61:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP60]], i1 false)
+; CHECK-O3-NEXT:    [[TMP62:%.*]] = zext <16 x i16> [[TMP61]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP63:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP62]])
+; CHECK-O3-NEXT:    [[OP_RDX_7:%.*]] = add i32 [[TMP63]], [[OP_RDX_6]]
+; CHECK-O3-NEXT:    [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP64:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP65:%.*]] = zext <16 x i8> [[TMP64]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP66:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP67:%.*]] = zext <16 x i8> [[TMP66]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP68:%.*]] = sub nsw <16 x i16> [[TMP65]], [[TMP67]]
+; CHECK-O3-NEXT:    [[TMP69:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP68]], i1 false)
+; CHECK-O3-NEXT:    [[TMP70:%.*]] = zext <16 x i16> [[TMP69]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP71:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP70]])
+; CHECK-O3-NEXT:    [[OP_RDX_8:%.*]] = add i32 [[TMP71]], [[OP_RDX_7]]
+; CHECK-O3-NEXT:    [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP72:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP73:%.*]] = zext <16 x i8> [[TMP72]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP74:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP75:%.*]] = zext <16 x i8> [[TMP74]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP76:%.*]] = sub nsw <16 x i16> [[TMP73]], [[TMP75]]
+; CHECK-O3-NEXT:    [[TMP77:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP76]], i1 false)
+; CHECK-O3-NEXT:    [[TMP78:%.*]] = zext <16 x i16> [[TMP77]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP79:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP78]])
+; CHECK-O3-NEXT:    [[OP_RDX_9:%.*]] = add i32 [[TMP79]], [[OP_RDX_8]]
+; CHECK-O3-NEXT:    [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP80:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP81:%.*]] = zext <16 x i8> [[TMP80]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP82:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP83:%.*]] = zext <16 x i8> [[TMP82]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP84:%.*]] = sub nsw <16 x i16> [[TMP81]], [[TMP83]]
+; CHECK-O3-NEXT:    [[TMP85:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP84]], i1 false)
+; CHECK-O3-NEXT:    [[TMP86:%.*]] = zext <16 x i16> [[TMP85]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP87:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP86]])
+; CHECK-O3-NEXT:    [[OP_RDX_10:%.*]] = add i32 [[TMP87]], [[OP_RDX_9]]
+; CHECK-O3-NEXT:    [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP88:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP89:%.*]] = zext <16 x i8> [[TMP88]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP90:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP91:%.*]] = zext <16 x i8> [[TMP90]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP92:%.*]] = sub nsw <16 x i16> [[TMP89]], [[TMP91]]
+; CHECK-O3-NEXT:    [[TMP93:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP92]], i1 false)
+; CHECK-O3-NEXT:    [[TMP94:%.*]] = zext <16 x i16> [[TMP93]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP95:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP94]])
+; CHECK-O3-NEXT:    [[OP_RDX_11:%.*]] = add i32 [[TMP95]], [[OP_RDX_10]]
+; CHECK-O3-NEXT:    [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP96:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP97:%.*]] = zext <16 x i8> [[TMP96]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP98:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP99:%.*]] = zext <16 x i8> [[TMP98]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP100:%.*]] = sub nsw <16 x i16> [[TMP97]], [[TMP99]]
+; CHECK-O3-NEXT:    [[TMP101:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP100]], i1 false)
+; CHECK-O3-NEXT:    [[TMP102:%.*]] = zext <16 x i16> [[TMP101]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP103:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP102]])
+; CHECK-O3-NEXT:    [[OP_RDX_12:%.*]] = add i32 [[TMP103]], [[OP_RDX_11]]
+; CHECK-O3-NEXT:    [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP104:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP105:%.*]] = zext <16 x i8> [[TMP104]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP106:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP107:%.*]] = zext <16 x i8> [[TMP106]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP108:%.*]] = sub nsw <16 x i16> [[TMP105]], [[TMP107]]
+; CHECK-O3-NEXT:    [[TMP109:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP108]], i1 false)
+; CHECK-O3-NEXT:    [[TMP110:%.*]] = zext <16 x i16> [[TMP109]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP111:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP110]])
+; CHECK-O3-NEXT:    [[OP_RDX_13:%.*]] = add i32 [[TMP111]], [[OP_RDX_12]]
+; CHECK-O3-NEXT:    [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP112:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP113:%.*]] = zext <16 x i8> [[TMP112]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP114:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP115:%.*]] = zext <16 x i8> [[TMP114]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP116:%.*]] = sub nsw <16 x i16> [[TMP113]], [[TMP115]]
+; CHECK-O3-NEXT:    [[TMP117:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP116]], i1 false)
+; CHECK-O3-NEXT:    [[TMP118:%.*]] = zext <16 x i16> [[TMP117]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP118]])
+; CHECK-O3-NEXT:    [[OP_RDX_14:%.*]] = add i32 [[TMP119]], [[OP_RDX_13]]
+; CHECK-O3-NEXT:    [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
+; CHECK-O3-NEXT:    [[TMP120:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP121:%.*]] = zext <16 x i8> [[TMP120]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP122:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[TBAA0]]
+; CHECK-O3-NEXT:    [[TMP123:%.*]] = zext <16 x i8> [[TMP122]] to <16 x i16>
+; CHECK-O3-NEXT:    [[TMP124:%.*]] = sub nsw <16 x i16> [[TMP121]], [[TMP123]]
+; CHECK-O3-NEXT:    [[TMP125:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP124]], i1 false)
+; CHECK-O3-NEXT:    [[TMP126:%.*]] = zext <16 x i16> [[TMP125]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP126]])
+; CHECK-O3-NEXT:    [[OP_RDX_15:%.*]] = add i32 [[TMP127]], [[OP_RDX_14]]
+; CHECK-O3-NEXT:    ret i32 [[OP_RDX_15]]
+;
+; CHECK-LTO-LABEL: define dso_local i32 @test(
+; CHECK-LTO-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[S_P1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[S_P2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-LTO-NEXT:  [[ENTRY:.*:]]
+; CHECK-LTO-NEXT:    [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
+; CHECK-LTO-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
+; CHECK-LTO-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-LTO-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]]
+; CHECK-LTO-NEXT:    [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP36:%.*]] = zext nneg <16 x i16> [[TMP5]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP44:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP36]])
+; CHECK-LTO-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP10:%.*]] = sub nsw <16 x i16> [[TMP7]], [[TMP9]]
+; CHECK-LTO-NEXT:    [[TMP11:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP10]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP52:%.*]] = zext nneg <16 x i16> [[TMP11]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP60:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP52]])
+; CHECK-LTO-NEXT:    [[OP_RDX_1:%.*]] = add i32 [[TMP60]], [[TMP44]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP14:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP16:%.*]] = sub nsw <16 x i16> [[TMP13]], [[TMP15]]
+; CHECK-LTO-NEXT:    [[TMP17:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP16]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP68:%.*]] = zext nneg <16 x i16> [[TMP17]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP76:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP68]])
+; CHECK-LTO-NEXT:    [[OP_RDX_2:%.*]] = add i32 [[OP_RDX_1]], [[TMP76]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP20:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP22:%.*]] = sub nsw <16 x i16> [[TMP19]], [[TMP21]]
+; CHECK-LTO-NEXT:    [[TMP23:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP22]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP84:%.*]] = zext nneg <16 x i16> [[TMP23]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP92:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP84]])
+; CHECK-LTO-NEXT:    [[OP_RDX_3:%.*]] = add i32 [[OP_RDX_2]], [[TMP92]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]]
+; CHECK-LTO-NEXT:    [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP100:%.*]] = zext nneg <16 x i16> [[TMP29]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP108:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP100]])
+; CHECK-LTO-NEXT:    [[OP_RDX_4:%.*]] = add i32 [[OP_RDX_3]], [[TMP108]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP30:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP34:%.*]] = sub nsw <16 x i16> [[TMP31]], [[TMP33]]
+; CHECK-LTO-NEXT:    [[TMP35:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP34]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP116:%.*]] = zext nneg <16 x i16> [[TMP35]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP117:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP116]])
+; CHECK-LTO-NEXT:    [[OP_RDX_5:%.*]] = add i32 [[OP_RDX_4]], [[TMP117]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP37:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP38:%.*]] = zext <16 x i8> [[TMP37]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP39:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i16> [[TMP38]], [[TMP40]]
+; CHECK-LTO-NEXT:    [[TMP42:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP41]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP43:%.*]] = zext nneg <16 x i16> [[TMP42]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP118:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP43]])
+; CHECK-LTO-NEXT:    [[OP_RDX_6:%.*]] = add i32 [[OP_RDX_5]], [[TMP118]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP45:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP46:%.*]] = zext <16 x i8> [[TMP45]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP47:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i16> [[TMP46]], [[TMP48]]
+; CHECK-LTO-NEXT:    [[TMP50:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP49]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP51:%.*]] = zext nneg <16 x i16> [[TMP50]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP120:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP51]])
+; CHECK-LTO-NEXT:    [[OP_RDX_7:%.*]] = add i32 [[OP_RDX_6]], [[TMP120]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP53:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP54:%.*]] = zext <16 x i8> [[TMP53]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP55:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP57:%.*]] = sub nsw <16 x i16> [[TMP54]], [[TMP56]]
+; CHECK-LTO-NEXT:    [[TMP58:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP57]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP59:%.*]] = zext nneg <16 x i16> [[TMP58]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP121:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP59]])
+; CHECK-LTO-NEXT:    [[OP_RDX_8:%.*]] = add i32 [[OP_RDX_7]], [[TMP121]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP61:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP62:%.*]] = zext <16 x i8> [[TMP61]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP63:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP64:%.*]] = zext <16 x i8> [[TMP63]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP65:%.*]] = sub nsw <16 x i16> [[TMP62]], [[TMP64]]
+; CHECK-LTO-NEXT:    [[TMP66:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP65]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP67:%.*]] = zext nneg <16 x i16> [[TMP66]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP122:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP67]])
+; CHECK-LTO-NEXT:    [[OP_RDX_9:%.*]] = add i32 [[OP_RDX_8]], [[TMP122]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP69:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP70:%.*]] = zext <16 x i8> [[TMP69]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP71:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP72:%.*]] = zext <16 x i8> [[TMP71]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP73:%.*]] = sub nsw <16 x i16> [[TMP70]], [[TMP72]]
+; CHECK-LTO-NEXT:    [[TMP74:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP73]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP75:%.*]] = zext nneg <16 x i16> [[TMP74]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP123:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP75]])
+; CHECK-LTO-NEXT:    [[OP_RDX_10:%.*]] = add i32 [[OP_RDX_9]], [[TMP123]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP77:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP78:%.*]] = zext <16 x i8> [[TMP77]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP79:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP80:%.*]] = zext <16 x i8> [[TMP79]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP81:%.*]] = sub nsw <16 x i16> [[TMP78]], [[TMP80]]
+; CHECK-LTO-NEXT:    [[TMP82:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP81]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP83:%.*]] = zext nneg <16 x i16> [[TMP82]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP124:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP83]])
+; CHECK-LTO-NEXT:    [[OP_RDX_11:%.*]] = add i32 [[OP_RDX_10]], [[TMP124]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP85:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP86:%.*]] = zext <16 x i8> [[TMP85]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP87:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP88:%.*]] = zext <16 x i8> [[TMP87]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP89:%.*]] = sub nsw <16 x i16> [[TMP86]], [[TMP88]]
+; CHECK-LTO-NEXT:    [[TMP90:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP89]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP91:%.*]] = zext nneg <16 x i16> [[TMP90]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP125:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP91]])
+; CHECK-LTO-NEXT:    [[OP_RDX_12:%.*]] = add i32 [[OP_RDX_11]], [[TMP125]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP93:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP94:%.*]] = zext <16 x i8> [[TMP93]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP95:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP96:%.*]] = zext <16 x i8> [[TMP95]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP97:%.*]] = sub nsw <16 x i16> [[TMP94]], [[TMP96]]
+; CHECK-LTO-NEXT:    [[TMP98:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP97]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP99:%.*]] = zext nneg <16 x i16> [[TMP98]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP126:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP99]])
+; CHECK-LTO-NEXT:    [[OP_RDX_13:%.*]] = add i32 [[OP_RDX_12]], [[TMP126]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP101:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP102:%.*]] = zext <16 x i8> [[TMP101]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP103:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP104:%.*]] = zext <16 x i8> [[TMP103]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP105:%.*]] = sub nsw <16 x i16> [[TMP102]], [[TMP104]]
+; CHECK-LTO-NEXT:    [[TMP106:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP105]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP107:%.*]] = zext nneg <16 x i16> [[TMP106]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP107]])
+; CHECK-LTO-NEXT:    [[OP_RDX_14:%.*]] = add i32 [[OP_RDX_13]], [[TMP119]]
+; CHECK-LTO-NEXT:    [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]]
+; CHECK-LTO-NEXT:    [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
+; CHECK-LTO-NEXT:    [[TMP109:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP110:%.*]] = zext <16 x i8> [[TMP109]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP111:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[TBAA0]]
+; CHECK-LTO-NEXT:    [[TMP112:%.*]] = zext <16 x i8> [[TMP111]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[TMP113:%.*]] = sub nsw <16 x i16> [[TMP110]], [[TMP112]]
+; CHECK-LTO-NEXT:    [[TMP114:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP113]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP115:%.*]] = zext nneg <16 x i16> [[TMP114]] to <16 x i32>
+; CHECK-LTO-NEXT:    [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP115]])
+; CHECK-LTO-NEXT:    [[OP_RDX_15:%.*]] = add i32 [[OP_RDX_14]], [[TMP127]]
+; CHECK-LTO-NEXT:    ret i32 [[OP_RDX_15]]
+;
+entry:
+  %p1.addr = alloca ptr, align 8
+  %s_p1.addr = alloca i32, align 4
+  %p2.addr = alloca ptr, align 8
+  %s_p2.addr = alloca i32, align 4
+  %i_sum = alloca i32, align 4
+  %y = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %x = alloca i32, align 4
+  store ptr %p1, ptr %p1.addr, align 8, !tbaa !4
+  store i32 %s_p1, ptr %s_p1.addr, align 4, !tbaa !9
+  store ptr %p2, ptr %p2.addr, align 8, !tbaa !4
+  store i32 %s_p2, ptr %s_p2.addr, align 4, !tbaa !9
+  call void @llvm.lifetime.start.p0(i64 4, ptr %i_sum) #3
+  store i32 0, ptr %i_sum, align 4, !tbaa !9
+  call void @llvm.lifetime.start.p0(i64 4, ptr %y) #3
+  store i32 0, ptr %y, align 4, !tbaa !9
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %entry
+  %0 = load i32, ptr %y, align 4, !tbaa !9
+  %cmp = icmp slt i32 %0, 16
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  store i32 2, ptr %cleanup.dest.slot, align 4
+  call void @llvm.lifetime.end.p0(i64 4, ptr %y) #3
+  br label %for.end12
+
+for.body:                                         ; preds = %for.cond
+  call void @llvm.lifetime.start.p0(i64 4, ptr %x) #3
+  store i32 0, ptr %x, align 4, !tbaa !9
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %1 = load i32, ptr %x, align 4, !tbaa !9
+  %cmp2 = icmp slt i32 %1, 16
+  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
+
+for.cond.cleanup3:                                ; preds = %for.cond1
+  store i32 5, ptr %cleanup.dest.slot, align 4
+  call void @llvm.lifetime.end.p0(i64 4, ptr %x) #3
+  br label %for.end
+
+for.body4:                                        ; preds = %for.cond1
+  %2 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %3 = load i32, ptr %x, align 4, !tbaa !9
+  %idxprom = sext i32 %3 to i64
+  %arrayidx = getelementptr inbounds i8, ptr %2, i64 %idxprom
+  %4 = load i8, ptr %arrayidx, align 1, !tbaa !11
+  %conv = zext i8 %4 to i32
+  %5 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %6 = load i32, ptr %x, align 4, !tbaa !9
+  %idxprom5 = sext i32 %6 to i64
+  %arrayidx6 = getelementptr inbounds i8, ptr %5, i64 %idxprom5
+  %7 = load i8, ptr %arrayidx6, align 1, !tbaa !11
+  %conv7 = zext i8 %7 to i32
+  %sub = sub nsw i32 %conv, %conv7
+  %8 = call i32 @llvm.abs.i32(i32 %sub, i1 true)
+  %9 = load i32, ptr %i_sum, align 4, !tbaa !9
+  %add = add nsw i32 %9, %8
+  store i32 %add, ptr %i_sum, align 4, !tbaa !9
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body4
+  %10 = load i32, ptr %x, align 4, !tbaa !9
+  %inc = add nsw i32 %10, 1
+  store i32 %inc, ptr %x, align 4, !tbaa !9
+  br label %for.cond1, !llvm.loop !12
+
+for.end:                                          ; preds = %for.cond.cleanup3
+  %11 = load i32, ptr %s_p1.addr, align 4, !tbaa !9
+  %12 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %idx.ext = sext i32 %11 to i64
+  %add.ptr = getelementptr inbounds i8, ptr %12, i64 %idx.ext
+  store ptr %add.ptr, ptr %p1.addr, align 8, !tbaa !4
+  %13 = load i32, ptr %s_p2.addr, align 4, !tbaa !9
+  %14 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %idx.ext8 = sext i32 %13 to i64
+  %add.ptr9 = getelementptr inbounds i8, ptr %14, i64 %idx.ext8
+  store ptr %add.ptr9, ptr %p2.addr, align 8, !tbaa !4
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.end
+  %15 = load i32, ptr %y, align 4, !tbaa !9
+  %inc11 = add nsw i32 %15, 1
+  store i32 %inc11, ptr %y, align 4, !tbaa !9
+  br label %for.cond, !llvm.loop !14
+
+for.end12:                                        ; preds = %for.cond.cleanup
+  %16 = load i32, ptr %i_sum, align 4, !tbaa !9
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  call void @llvm.lifetime.end.p0(i64 4, ptr %i_sum) #3
+  ret i32 %16
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.abs.i32(i32, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr) #1
+
+attributes #0 = { nounwind uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+rand,+ras,+rcpc,+rdm,+sb,+spe,+ssbs,+sve,+sve-bitperm,+sve2,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #3 = { nounwind }
+
+
+!4 = !{!5, !5, i64 0}
+!5 = !{!"p1 omnipotent char", !6, i64 0}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !7, i64 0}
+!11 = !{!7, !7, i64 0}
+!12 = distinct !{!12, !13}
+!13 = !{!"llvm.loop.mustprogress"}
+!14 = distinct !{!14, !13}
+;.
+; CHECK-O3: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK-O3: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0}
+; CHECK-O3: [[META2]] = !{!"Simple C/C++ TBAA"}
+;.
+; CHECK-LTO: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK-LTO: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0}
+; CHECK-LTO: [[META2]] = !{!"Simple C/C++ TBAA"}
+;.


        


More information about the llvm-commits mailing list