[clang] [llvm] [VectorCombine] Shrink loads used in shufflevector rebroadcasts (PR #128938)
Leon Clark via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 26 05:37:58 PDT 2025
https://github.com/PeddleSpam updated https://github.com/llvm/llvm-project/pull/128938
>From f1c09277af268256fce71df9a858959b69385ef1 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 26 Feb 2025 15:59:02 +0000
Subject: [PATCH 01/11] [AggressiveInstCombine] Shrink loads used in
shufflevector rebroadcasts.
Attempt to shrink the size of vector loads where only some of the incoming lanes are used for rebroadcasts in shufflevector instructions.
---
.../load-shufflevector.ll | 345 ++++++++++++++++++
1 file changed, 345 insertions(+)
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
new file mode 100644
index 0000000000000..3f6c8334e61cf
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
@@ -0,0 +1,345 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=aggressive-instcombine -S < %s | FileCheck %s
+
+define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: ret <8 x half> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x half> %val1
+}
+
+define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: ret <8 x half> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x half> %val1
+}
+
+define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: ret <4 x half> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+ ret <4 x half> %val1
+}
+
+define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: ret <8 x half> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x half> %val1
+}
+
+define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x half> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x half> %val3
+}
+
+define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <4 x half> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <4 x half> [ %val1, %then ], [ %val2, %else ]
+ ret <4 x half> %val3
+}
+
+define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x half> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x half> %val3
+}
+
+define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: ret <8 x i32> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %val1
+}
+
+define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: ret <8 x i32> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i32> %val1
+}
+
+define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: ret <4 x i32> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+ ret <4 x i32> %val1
+}
+
+define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: ret <8 x i32> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i32> %val1
+}
+
+define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x i32> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x i32> %val3
+}
+
+define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x i32> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x i32> %val3
+}
+
+define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <4 x i32> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <4 x i32> [ %val1, %then ], [ %val2, %else ]
+ ret <4 x i32> %val3
+}
+
+define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x i32> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x i32> %val3
+}
>From b8ec65331def0fce1f70d203f3473accbdd77865 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 26 Feb 2025 21:18:30 +0000
Subject: [PATCH 02/11] Add implementation and update tests.
---
clang/test/CodeGenOpenCL/preserve_vec3.cl | 20 ++---
.../AggressiveInstCombine.cpp | 90 +++++++++++++++++++
.../load-shufflevector.ll | 88 +++++++++---------
3 files changed, 144 insertions(+), 54 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index 49ebae6fc7013..0538eac4029bb 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index fe7b3b1676e08..cbdf99316e9e3 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -915,6 +915,95 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
return true;
}
+// If `I` is a load instruction, used only by shufflevector instructions with
+// poison values, attempt to shrink the load to only the lanes being used.
+static bool shrinkLoadsForBroadcast(Instruction &I) {
+ auto *OldLoad = dyn_cast<LoadInst>(&I);
+ if (!OldLoad)
+ return false;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!VecTy)
+ return false;
+
+ auto IsPoisonOrUndef = [](Value *V) -> bool {
+ if (auto *C = dyn_cast<Constant>(V)) {
+ return isa<PoisonValue>(C) || isa<UndefValue>(C);
+ }
+ return false;
+ };
+
+ using IndexRange = std::pair<unsigned, unsigned>;
+ auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
+ auto OutputRange = IndexRange(VecTy->getNumElements(), 0u);
+ for (auto &Use: I.uses()) {
+ // All uses must be ShuffleVector instructions.
+ auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
+ if (!Shuffle)
+ return {};
+
+ // Get index range for value.
+ auto *Op0 = Shuffle->getOperand(0u);
+ auto *Op1 = Shuffle->getOperand(1u);
+ if (!IsPoisonOrUndef(Op1))
+ return {};
+
+ // Find the min and max indices used by the ShuffleVector instruction.
+ auto Mask = Shuffle->getShuffleMask();
+ auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
+ auto NumElems = Op0Ty->getNumElements();
+
+ for (unsigned Index: Mask) {
+ if (Index < NumElems) {
+ OutputRange.first = std::min(Index, OutputRange.first);
+ OutputRange.second = std::max(Index, OutputRange.second);
+ }
+ }
+ }
+ return OutputRange;
+ };
+
+ if (auto Indices = GetIndexRangeInShuffles()) {
+ auto OldSize = VecTy->getNumElements();
+ auto NewSize = Indices->second + 1u;
+
+ if (NewSize < OldSize) {
+ auto Builder = IRBuilder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ // Create new load of smaller vector.
+ auto *ElemTy = VecTy->getElementType();
+ auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
+ auto *NewLoad = cast<LoadInst>(
+ Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
+ NewLoad->copyMetadata(I);
+
+ // Replace all users.
+ auto OldShuffles = SmallVector<ShuffleVectorInst*, 4u>{};
+ for (auto &Use: I.uses()) {
+ auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+
+ Builder.SetInsertPoint(Shuffle);
+ Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
+ auto *NewShuffle = Builder.CreateShuffleVector(
+ NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()
+ );
+
+ Shuffle->replaceAllUsesWith(NewShuffle);
+ OldShuffles.push_back(Shuffle);
+ }
+
+ // Erase old users.
+ for (auto *Shuffle: OldShuffles)
+ Shuffle->eraseFromParent();
+
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
+
namespace {
class StrNCmpInliner {
public:
@@ -1251,6 +1340,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= tryToRecognizeTableBasedCttz(I);
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
MadeChange |= foldPatternedLoads(I, DL);
+ MadeChange |= shrinkLoadsForBroadcast(I);
// NOTE: This function introduces erasing of the instruction `I`, so it
// needs to be called at the end of this sequence, otherwise we may make
// bugs.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
index 3f6c8334e61cf..57006f2c65380 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
@@ -5,8 +5,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -19,8 +19,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -33,8 +33,8 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
entry:
@@ -47,8 +47,8 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -61,16 +61,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
@@ -94,16 +94,16 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x half> [[VAL3]]
;
entry:
@@ -127,16 +127,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
@@ -160,8 +160,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -174,8 +174,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -188,8 +188,8 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
entry:
@@ -202,8 +202,8 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -216,16 +216,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
@@ -249,16 +249,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
@@ -282,16 +282,16 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x i32> [[VAL3]]
;
entry:
@@ -315,16 +315,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
>From 0f9aada48f0cc08726a0e3f79ab78bead7991d3e Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 28 Feb 2025 17:25:35 +0000
Subject: [PATCH 03/11] Fix broken tests.
---
.../builtins-systemz-zvector-constrained.c | 4 +-
.../SystemZ/builtins-systemz-zvector.c | 52 +++++++++----------
.../builtins-systemz-zvector2-constrained.c | 12 ++---
.../SystemZ/builtins-systemz-zvector2.c | 12 ++---
.../AggressiveInstCombine.cpp | 19 ++++---
5 files changed, 49 insertions(+), 50 deletions(-)
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
index 4993df20df143..e335c363ecb48 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
@@ -79,8 +79,8 @@ void test_core(void) {
vec_xstd2(vd, idx, ptrd);
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
index d5d15b4dea966..422c97a77511c 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
@@ -777,80 +777,80 @@ void test_core(void) {
// CHECK: <2 x i64> splat (i64 -4503582447501313)
vsc = vec_splat(vsc, 0);
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vrepb
+ // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vlrepb
vsc = vec_splat(vsc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vuc = vec_splat(vuc, 0);
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vrepb
+ // CHECK: store volatile <16 x i8> splat (i8 {{.*}}), ptr @vuc
+ // CHECK-ASM: vst
vuc = vec_splat(vuc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vbc = vec_splat(vbc, 0);
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vrepb
+ // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vlrepb
vbc = vec_splat(vbc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vss = vec_splat(vss, 0);
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vreph
+ // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vlreph
vss = vec_splat(vss, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vus = vec_splat(vus, 0);
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vreph
+ // CHECK: store volatile <8 x i16> splat (i16 {{.*}}), ptr @vus
+ // CHECK-ASM: vst
vus = vec_splat(vus, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vbs = vec_splat(vbs, 0);
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vreph
+ // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vlreph
vbs = vec_splat(vbs, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vsi = vec_splat(vsi, 0);
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vlrepf
vsi = vec_splat(vsi, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vui = vec_splat(vui, 0);
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: store volatile <4 x i32> splat (i32 {{.*}}), ptr @vui
+ // CHECK-ASM: vst
vui = vec_splat(vui, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vbi = vec_splat(vbi, 0);
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vlrepf
vbi = vec_splat(vbi, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vsl = vec_splat(vsl, 0);
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer
// CHECK-ASM: vrepg
vsl = vec_splat(vsl, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
- // CHECK-ASM: vrepg
+ // CHECK-ASM: vst
vul = vec_splat(vul, 0);
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: store volatile <2 x i64> splat (i64 {{.*}}), ptr @vul
+ // CHECK-ASM: vst
vul = vec_splat(vul, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
vbl = vec_splat(vbl, 0);
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vbl = vec_splat(vbl, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
index 25b3e0b68cd02..2b79df2a1886e 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
@@ -130,14 +130,14 @@ void test_core(void) {
// CHECK-ASM: vst
vf = vec_splat(vf, 0);
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vlrepf
vf = vec_splat(vf, 1);
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ // CHECK-ASM: vst
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
index c1ef178fcfaa9..1ccbe6df5f16d 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
@@ -254,14 +254,14 @@ void test_core(void) {
// CHECK-ASM: vstrlr
vf = vec_splat(vf, 0);
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vlrepf
vf = vec_splat(vf, 1);
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ // CHECK-ASM: vst
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index cbdf99316e9e3..813935499f86e 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -915,7 +915,7 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
return true;
}
-// If `I` is a load instruction, used only by shufflevector instructions with
+// If `I` is a load instruction, used only by shufflevector instructions with
// poison values, attempt to shrink the load to only the lanes being used.
static bool shrinkLoadsForBroadcast(Instruction &I) {
auto *OldLoad = dyn_cast<LoadInst>(&I);
@@ -936,7 +936,7 @@ static bool shrinkLoadsForBroadcast(Instruction &I) {
using IndexRange = std::pair<unsigned, unsigned>;
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
auto OutputRange = IndexRange(VecTy->getNumElements(), 0u);
- for (auto &Use: I.uses()) {
+ for (auto &Use : I.uses()) {
// All uses must be ShuffleVector instructions.
auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
if (!Shuffle)
@@ -953,7 +953,7 @@ static bool shrinkLoadsForBroadcast(Instruction &I) {
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
auto NumElems = Op0Ty->getNumElements();
- for (unsigned Index: Mask) {
+ for (unsigned Index : Mask) {
if (Index < NumElems) {
OutputRange.first = std::min(Index, OutputRange.first);
OutputRange.second = std::max(Index, OutputRange.second);
@@ -975,26 +975,25 @@ static bool shrinkLoadsForBroadcast(Instruction &I) {
auto *ElemTy = VecTy->getElementType();
auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
auto *NewLoad = cast<LoadInst>(
- Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
+ Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
NewLoad->copyMetadata(I);
// Replace all users.
- auto OldShuffles = SmallVector<ShuffleVectorInst*, 4u>{};
- for (auto &Use: I.uses()) {
+ auto OldShuffles = SmallVector<ShuffleVectorInst *, 4u>{};
+ for (auto &Use : I.uses()) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
-
+
Builder.SetInsertPoint(Shuffle);
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
auto *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()
- );
+ NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask());
Shuffle->replaceAllUsesWith(NewShuffle);
OldShuffles.push_back(Shuffle);
}
// Erase old users.
- for (auto *Shuffle: OldShuffles)
+ for (auto *Shuffle : OldShuffles)
Shuffle->eraseFromParent();
I.eraseFromParent();
>From d294e75b2a8b665bee90fde75c501d9815257a30 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Tue, 4 Mar 2025 17:44:56 +0000
Subject: [PATCH 04/11] Ignore non-simple loads.
---
.../builtins-systemz-zvector-constrained.c | 4 +-
.../SystemZ/builtins-systemz-zvector.c | 52 +++++++++----------
.../builtins-systemz-zvector2-constrained.c | 12 ++---
.../SystemZ/builtins-systemz-zvector2.c | 12 ++---
.../AggressiveInstCombine.cpp | 2 +-
.../load-shufflevector.ll | 14 +++++
6 files changed, 55 insertions(+), 41 deletions(-)
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
index e335c363ecb48..4993df20df143 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
@@ -79,8 +79,8 @@ void test_core(void) {
vec_xstd2(vd, idx, ptrd);
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
index 422c97a77511c..d5d15b4dea966 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
@@ -777,80 +777,80 @@ void test_core(void) {
// CHECK: <2 x i64> splat (i64 -4503582447501313)
vsc = vec_splat(vsc, 0);
- // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vlrepb
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vrepb
vsc = vec_splat(vsc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vuc = vec_splat(vuc, 0);
- // CHECK: store volatile <16 x i8> splat (i8 {{.*}}), ptr @vuc
- // CHECK-ASM: vst
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vrepb
vuc = vec_splat(vuc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vbc = vec_splat(vbc, 0);
- // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vlrepb
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vrepb
vbc = vec_splat(vbc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vss = vec_splat(vss, 0);
- // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vlreph
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vreph
vss = vec_splat(vss, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vus = vec_splat(vus, 0);
- // CHECK: store volatile <8 x i16> splat (i16 {{.*}}), ptr @vus
- // CHECK-ASM: vst
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vreph
vus = vec_splat(vus, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vbs = vec_splat(vbs, 0);
- // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vlreph
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vreph
vbs = vec_splat(vbs, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vsi = vec_splat(vsi, 0);
- // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vlrepf
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vsi = vec_splat(vsi, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vui = vec_splat(vui, 0);
- // CHECK: store volatile <4 x i32> splat (i32 {{.*}}), ptr @vui
- // CHECK-ASM: vst
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vui = vec_splat(vui, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vbi = vec_splat(vbi, 0);
- // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vlrepf
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vbi = vec_splat(vbi, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vsl = vec_splat(vsl, 0);
- // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK-ASM: vrepg
vsl = vec_splat(vsl, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
- // CHECK-ASM: vst
+ // CHECK-ASM: vrepg
vul = vec_splat(vul, 0);
- // CHECK: store volatile <2 x i64> splat (i64 {{.*}}), ptr @vul
- // CHECK-ASM: vst
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vul = vec_splat(vul, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
vbl = vec_splat(vbl, 0);
- // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vbl = vec_splat(vbl, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
index 2b79df2a1886e..25b3e0b68cd02 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
@@ -130,14 +130,14 @@ void test_core(void) {
// CHECK-ASM: vst
vf = vec_splat(vf, 0);
- // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vlrepf
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vf = vec_splat(vf, 1);
- // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- // CHECK-ASM: vst
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ // CHECK-ASM: vrepf
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
index 1ccbe6df5f16d..c1ef178fcfaa9 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
@@ -254,14 +254,14 @@ void test_core(void) {
// CHECK-ASM: vstrlr
vf = vec_splat(vf, 0);
- // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vlrepf
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vf = vec_splat(vf, 1);
- // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- // CHECK-ASM: vst
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ // CHECK-ASM: vrepf
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 813935499f86e..904fc6cd05170 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -919,7 +919,7 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
// poison values, attempt to shrink the load to only the lanes being used.
static bool shrinkLoadsForBroadcast(Instruction &I) {
auto *OldLoad = dyn_cast<LoadInst>(&I);
- if (!OldLoad)
+ if (!OldLoad || !OldLoad->isSimple())
return false;
auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
index 57006f2c65380..9978c15e90beb 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
@@ -1,6 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=aggressive-instcombine -S < %s | FileCheck %s
+define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: ret <8 x half> [[TMP1]]
+;
+entry:
+ %val0 = load volatile <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x half> %val1
+}
+
define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
>From 54e9d078ff8848810c3da24ea26e7ef0f3a2e760 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 12 Mar 2025 00:21:08 +0000
Subject: [PATCH 05/11] Move transform to VectorCombine and update tests.
---
.../AggressiveInstCombine.cpp | 89 -----------------
.../Transforms/Vectorize/VectorCombine.cpp | 99 +++++++++++++++++++
.../PhaseOrdering/X86/vec-load-combine.ll | 8 +-
.../VectorCombine/X86/load-widening.ll | 6 +-
.../VectorCombine/X86/shuffle-of-shuffles.ll | 24 ++---
.../load-shufflevector.ll | 2 +-
6 files changed, 116 insertions(+), 112 deletions(-)
rename llvm/test/Transforms/{AggressiveInstCombine => VectorCombine}/load-shufflevector.ll (99%)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 904fc6cd05170..fe7b3b1676e08 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -915,94 +915,6 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
return true;
}
-// If `I` is a load instruction, used only by shufflevector instructions with
-// poison values, attempt to shrink the load to only the lanes being used.
-static bool shrinkLoadsForBroadcast(Instruction &I) {
- auto *OldLoad = dyn_cast<LoadInst>(&I);
- if (!OldLoad || !OldLoad->isSimple())
- return false;
-
- auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
- if (!VecTy)
- return false;
-
- auto IsPoisonOrUndef = [](Value *V) -> bool {
- if (auto *C = dyn_cast<Constant>(V)) {
- return isa<PoisonValue>(C) || isa<UndefValue>(C);
- }
- return false;
- };
-
- using IndexRange = std::pair<unsigned, unsigned>;
- auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
- auto OutputRange = IndexRange(VecTy->getNumElements(), 0u);
- for (auto &Use : I.uses()) {
- // All uses must be ShuffleVector instructions.
- auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
- if (!Shuffle)
- return {};
-
- // Get index range for value.
- auto *Op0 = Shuffle->getOperand(0u);
- auto *Op1 = Shuffle->getOperand(1u);
- if (!IsPoisonOrUndef(Op1))
- return {};
-
- // Find the min and max indices used by the ShuffleVector instruction.
- auto Mask = Shuffle->getShuffleMask();
- auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
- auto NumElems = Op0Ty->getNumElements();
-
- for (unsigned Index : Mask) {
- if (Index < NumElems) {
- OutputRange.first = std::min(Index, OutputRange.first);
- OutputRange.second = std::max(Index, OutputRange.second);
- }
- }
- }
- return OutputRange;
- };
-
- if (auto Indices = GetIndexRangeInShuffles()) {
- auto OldSize = VecTy->getNumElements();
- auto NewSize = Indices->second + 1u;
-
- if (NewSize < OldSize) {
- auto Builder = IRBuilder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- // Create new load of smaller vector.
- auto *ElemTy = VecTy->getElementType();
- auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
- auto *NewLoad = cast<LoadInst>(
- Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
- NewLoad->copyMetadata(I);
-
- // Replace all users.
- auto OldShuffles = SmallVector<ShuffleVectorInst *, 4u>{};
- for (auto &Use : I.uses()) {
- auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
-
- Builder.SetInsertPoint(Shuffle);
- Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
- auto *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask());
-
- Shuffle->replaceAllUsesWith(NewShuffle);
- OldShuffles.push_back(Shuffle);
- }
-
- // Erase old users.
- for (auto *Shuffle : OldShuffles)
- Shuffle->eraseFromParent();
-
- I.eraseFromParent();
- return true;
- }
- }
- return false;
-}
-
namespace {
class StrNCmpInliner {
public:
@@ -1339,7 +1251,6 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= tryToRecognizeTableBasedCttz(I);
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
MadeChange |= foldPatternedLoads(I, DL);
- MadeChange |= shrinkLoadsForBroadcast(I);
// NOTE: This function introduces erasing of the instruction `I`, so it
// needs to be called at the end of this sequence, otherwise we may make
// bugs.
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 2e20961b4c912..fcb3f03f93af4 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -129,6 +129,7 @@ class VectorCombine {
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
+ bool shrinkLoadForShuffles(Instruction &I);
void replaceValue(Value &Old, Value &New) {
LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3398,6 +3399,101 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
+// If `I` is a load instruction, used only by shufflevector instructions with
+// poison values, attempt to shrink the load to only the lanes being used.
+bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
+ auto *OldLoad = dyn_cast<LoadInst>(&I);
+ if (!OldLoad || !OldLoad->isSimple())
+ return false;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!VecTy)
+ return false;
+
+ auto IsPoisonOrUndef = [](Value *V) -> bool {
+ if (auto *C = dyn_cast<Constant>(V)) {
+ return isa<PoisonValue>(C) || isa<UndefValue>(C);
+ }
+ return false;
+ };
+
+ using IndexRange = std::pair<int, int>;
+ auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
+ auto OutputRange = IndexRange(VecTy->getNumElements(), -1);
+ for (auto &Use : I.uses()) {
+ // All uses must be ShuffleVector instructions.
+ auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
+ if (!Shuffle)
+ return {};
+
+ // Get index range for value.
+ auto *Op0 = Shuffle->getOperand(0u);
+ auto *Op1 = Shuffle->getOperand(1u);
+ if (!IsPoisonOrUndef(Op1))
+ return {};
+
+ // Find the min and max indices used by the ShuffleVector instruction.
+ auto Mask = Shuffle->getShuffleMask();
+ auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
+ auto NumElems = int(Op0Ty->getNumElements());
+
+ for (auto Index : Mask) {
+ if (Index >= 0 && Index < NumElems) {
+ OutputRange.first = std::min(Index, OutputRange.first);
+ OutputRange.second = std::max(Index, OutputRange.second);
+ }
+ }
+
+ if (OutputRange.second < OutputRange.first)
+ return {};
+ }
+ return OutputRange;
+ };
+
+ if (auto Indices = GetIndexRangeInShuffles()) {
+ auto OldSize = VecTy->getNumElements();
+ auto NewSize = Indices->second + 1u;
+
+ if (NewSize < OldSize) {
+ auto Builder = IRBuilder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ // Create new load of smaller vector.
+ auto *ElemTy = VecTy->getElementType();
+ auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
+ auto *NewLoad = cast<LoadInst>(
+ Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
+ NewLoad->copyMetadata(I);
+
+ // Compare cost of old and new loads.
+ auto OldCost = TTI.getMemoryOpCost(
+ Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
+ OldLoad->getPointerAddressSpace(), CostKind);
+ auto NewCost = TTI.getMemoryOpCost(
+ Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
+ NewLoad->getPointerAddressSpace(), CostKind);
+
+ if (OldCost < NewCost || !NewCost.isValid())
+ return false;
+
+ // Replace all users.
+ for (auto &Use : I.uses()) {
+ auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+
+ Builder.SetInsertPoint(Shuffle);
+ Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
+ auto *NewShuffle = Builder.CreateShuffleVector(
+ NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask());
+
+ replaceValue(*Shuffle, *NewShuffle);
+ }
+
+ return true;
+ }
+ }
+ return false;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -3475,6 +3571,9 @@ bool VectorCombine::run() {
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
break;
+ case Instruction::Load:
+ MadeChange |= shrinkLoadForShuffles(I);
+ break;
default:
MadeChange |= shrinkType(I);
break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 85f6fceb5bdbe..9218cc2d019f8 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index 30a089818074e..07bd3966e8202 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -336,7 +336,7 @@ define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) {
define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32(
-; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index b30dc9ffdc596..9a051361e2081 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
-; SSE-NEXT: ret <4 x double> [[BLEND]]
-;
-; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; AVX-NEXT: ret <4 x double> [[BLEND]]
+; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 8
+; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
+; SSE: {{.*}}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
similarity index 99%
rename from llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
rename to llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 9978c15e90beb..44724a5f1f127 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=aggressive-instcombine -S < %s | FileCheck %s
+; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
>From 9033d216a488a5bed70edc094a29251ed5d90916 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 14 Mar 2025 17:40:27 +0000
Subject: [PATCH 06/11] Address review comments and update tests.
---
.../Transforms/Vectorize/VectorCombine.cpp | 26 ++++++++++++----
.../VectorCombine/X86/load-widening.ll | 4 +--
.../VectorCombine/X86/shuffle-of-shuffles.ll | 4 +--
.../VectorCombine/load-shufflevector.ll | 30 +++++++++----------
4 files changed, 39 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fcb3f03f93af4..6951f3ff484c2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3443,10 +3443,11 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
OutputRange.second = std::max(Index, OutputRange.second);
}
}
-
- if (OutputRange.second < OutputRange.first)
- return {};
}
+
+ if (OutputRange.second < OutputRange.first)
+ return {};
+
return OutputRange;
};
@@ -3461,11 +3462,12 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
// Create new load of smaller vector.
auto *ElemTy = VecTy->getElementType();
auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
+ auto *PtrOp = OldLoad->getPointerOperand();
auto *NewLoad = cast<LoadInst>(
- Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
+ Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign()));
NewLoad->copyMetadata(I);
- // Compare cost of old and new loads.
+ // Compare cost of old and new ops.
auto OldCost = TTI.getMemoryOpCost(
Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
OldLoad->getPointerAddressSpace(), CostKind);
@@ -3473,8 +3475,20 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
NewLoad->getPointerAddressSpace(), CostKind);
- if (OldCost < NewCost || !NewCost.isValid())
+ for (auto &Use : I.uses()) {
+ auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+ auto Mask = Shuffle->getShuffleMask();
+
+ OldCost += TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
+ NewCost += TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc, NewVecTy, Mask, CostKind);
+ }
+
+ if (OldCost < NewCost || !NewCost.isValid()) {
+ NewLoad->eraseFromParent();
return false;
+ }
// Replace all users.
for (auto &Use : I.uses()) {
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index 07bd3966e8202..eacc40bfa9b53 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -336,7 +336,7 @@ define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) {
define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32(
-; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
@@ -443,7 +443,7 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index 9a051361e2081..eddfc57a7d256 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -49,8 +49,8 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
; CHECK-NEXT: ret <4 x double> [[BLEND]]
;
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 44724a5f1f127..3e302c5f43032 100644
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -19,7 +19,7 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
@@ -33,7 +33,7 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
@@ -47,7 +47,7 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
@@ -61,7 +61,7 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
@@ -75,7 +75,7 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer
@@ -108,7 +108,7 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -141,7 +141,7 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -174,7 +174,7 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
@@ -188,7 +188,7 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
@@ -202,7 +202,7 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
@@ -216,7 +216,7 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
@@ -230,7 +230,7 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
@@ -263,7 +263,7 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer
@@ -296,7 +296,7 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -329,7 +329,7 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
>From 5f16d8b3ef8616678b2b723b23ef36919f015d79 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 14 Mar 2025 17:53:21 +0000
Subject: [PATCH 07/11] Code formatting.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 6951f3ff484c2..27f1e17d88235 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3479,10 +3479,10 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
auto Mask = Shuffle->getShuffleMask();
- OldCost += TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
- NewCost += TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc, NewVecTy, Mask, CostKind);
+ OldCost +=
+ TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
+ NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, Mask,
+ CostKind);
}
if (OldCost < NewCost || !NewCost.isValid()) {
>From a0a089661cc45f5ba454d6fb483315d71d244cbd Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 21 Mar 2025 05:23:37 +0000
Subject: [PATCH 08/11] Add cost analysis for shufflevector ops and update
tests.
---
clang/test/CodeGenOpenCL/preserve_vec3.cl | 20 ++---
.../Transforms/Vectorize/VectorCombine.cpp | 52 +++++++----
.../PhaseOrdering/X86/vec-load-combine.ll | 8 +-
.../VectorCombine/X86/load-inseltpoison.ll | 4 +-
.../VectorCombine/X86/load-widening.ll | 4 +-
.../VectorCombine/X86/shuffle-of-shuffles.ll | 24 +++--
.../VectorCombine/load-shufflevector.ll | 88 +++++++++----------
7 files changed, 111 insertions(+), 89 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index 0538eac4029bb..49ebae6fc7013 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 27f1e17d88235..00caaac00e286 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -33,6 +34,7 @@
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
#include <queue>
+#include <tuple>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -3399,10 +3401,13 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
-// If `I` is a load instruction, used only by shufflevector instructions with
-// poison values, attempt to shrink the load to only the lanes being used.
+// Attempt to shrink loads that are only used by shufflevector instructions.
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
- auto *OldLoad = dyn_cast<LoadInst>(&I);
+ auto *InputShuffle = dyn_cast<ShuffleVectorInst>(&I);
+ if (!InputShuffle)
+ return {};
+
+ auto *OldLoad = dyn_cast<LoadInst>(InputShuffle->getOperand(0u));
if (!OldLoad || !OldLoad->isSimple())
return false;
@@ -3438,7 +3443,8 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto NumElems = int(Op0Ty->getNumElements());
for (auto Index : Mask) {
- if (Index >= 0 && Index < NumElems) {
+ if (Index >= 0) {
+ Index %= NumElems;
OutputRange.first = std::min(Index, OutputRange.first);
OutputRange.second = std::max(Index, OutputRange.second);
}
@@ -3467,7 +3473,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign()));
NewLoad->copyMetadata(I);
- // Compare cost of old and new ops.
+ // Calculate costs of old and new ops.
auto OldCost = TTI.getMemoryOpCost(
Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
OldLoad->getPointerAddressSpace(), CostKind);
@@ -3475,14 +3481,25 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
NewLoad->getPointerAddressSpace(), CostKind);
+ using UseEntry = std::pair<ShuffleVectorInst*, std::vector<int>>;
+ auto NewUses = SmallVector<UseEntry, 4u>();
+ auto SizeDiff = OldSize - NewSize;
+
for (auto &Use : I.uses()) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
- auto Mask = Shuffle->getShuffleMask();
-
- OldCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
- NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, Mask,
- CostKind);
+ auto OldMask = Shuffle->getShuffleMask();
+
+ // Create entry for new use.
+ NewUses.push_back({Shuffle, {}});
+ auto &NewMask = NewUses.back().second;
+ for (auto Index : OldMask)
+ NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index);
+
+ // Update costs.
+ OldCost += TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc, VecTy, OldMask, CostKind);
+ NewCost += TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc, NewVecTy, NewMask, CostKind);
}
if (OldCost < NewCost || !NewCost.isValid()) {
@@ -3490,14 +3507,15 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
return false;
}
- // Replace all users.
- for (auto &Use : I.uses()) {
- auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+ // Replace all uses.
+ for (auto &Use : NewUses) {
+ auto *Shuffle = Use.first;
+ auto &NewMask = Use.second;
Builder.SetInsertPoint(Shuffle);
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
auto *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask());
+ NewLoad, PoisonValue::get(NewVecTy), NewMask);
replaceValue(*Shuffle, *NewShuffle);
}
@@ -3581,13 +3599,11 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleOfIntrinsics(I);
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
+ MadeChange |= shrinkLoadForShuffles(I);
break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
break;
- case Instruction::Load:
- MadeChange |= shrinkLoadForShuffles(I);
- break;
default:
MadeChange |= shrinkType(I);
break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 9218cc2d019f8..85f6fceb5bdbe 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 73476308916fb..70197a756c8b3 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -577,8 +577,8 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT: [[L:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT: store <1 x i32> [[L]], ptr [[STORE_PTR:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <1 x i32> [[L]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%l = load <1 x i32>, ptr %p, align 4
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index eacc40bfa9b53..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index eddfc57a7d256..b30dc9ffdc596 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
-; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT: ret <4 x double> [[BLEND]]
+; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+; SSE-NEXT: ret <4 x double> [[BLEND]]
+;
+; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 3e302c5f43032..56c51ce7315cf 100644
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -19,8 +19,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -33,8 +33,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -47,8 +47,8 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
entry:
@@ -61,8 +61,8 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -75,16 +75,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
@@ -108,16 +108,16 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x half> [[VAL3]]
;
entry:
@@ -141,16 +141,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
@@ -174,8 +174,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -188,8 +188,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -202,8 +202,8 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
entry:
@@ -216,8 +216,8 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -230,16 +230,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
@@ -263,16 +263,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
@@ -296,16 +296,16 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x i32> [[VAL3]]
;
entry:
@@ -329,16 +329,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
>From b50f692aea8b3773abc78941d6f6d11b0121c465 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 21 Mar 2025 05:29:14 +0000
Subject: [PATCH 09/11] Code formatting.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 00caaac00e286..6c487fbd06342 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3481,7 +3481,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
NewLoad->getPointerAddressSpace(), CostKind);
- using UseEntry = std::pair<ShuffleVectorInst*, std::vector<int>>;
+ using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
auto NewUses = SmallVector<UseEntry, 4u>();
auto SizeDiff = OldSize - NewSize;
@@ -3496,10 +3496,10 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index);
// Update costs.
- OldCost += TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc, VecTy, OldMask, CostKind);
- NewCost += TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc, NewVecTy, NewMask, CostKind);
+ OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, OldMask,
+ CostKind);
+ NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy,
+ NewMask, CostKind);
}
if (OldCost < NewCost || !NewCost.isValid()) {
>From 58868e4a4ddba8ee5aa0beb2f863515013bff902 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Mon, 24 Mar 2025 23:55:57 +0000
Subject: [PATCH 10/11] Address comments.
---
.../Transforms/Vectorize/VectorCombine.cpp | 38 +++++++++----------
1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 6c487fbd06342..abe16f362c986 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3405,7 +3405,7 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto *InputShuffle = dyn_cast<ShuffleVectorInst>(&I);
if (!InputShuffle)
- return {};
+ return false;
auto *OldLoad = dyn_cast<LoadInst>(InputShuffle->getOperand(0u));
if (!OldLoad || !OldLoad->isSimple())
@@ -3415,34 +3415,31 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
if (!VecTy)
return false;
- auto IsPoisonOrUndef = [](Value *V) -> bool {
- if (auto *C = dyn_cast<Constant>(V)) {
- return isa<PoisonValue>(C) || isa<UndefValue>(C);
- }
- return false;
- };
-
+ // Search all uses of `I`. If all uses are shufflevector ops, and the second
+ // operands are all poison values, find the minimum and maximum indices of
+ // the vector elements referenced by all shuffle masks.
+ // Otherwise return `std::nullopt`.
using IndexRange = std::pair<int, int>;
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
- auto OutputRange = IndexRange(VecTy->getNumElements(), -1);
+ IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1);
for (auto &Use : I.uses()) {
// All uses must be ShuffleVector instructions.
auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
if (!Shuffle)
- return {};
+ return std::nullopt;
// Get index range for value.
- auto *Op0 = Shuffle->getOperand(0u);
- auto *Op1 = Shuffle->getOperand(1u);
- if (!IsPoisonOrUndef(Op1))
- return {};
+ auto *Op0 = Shuffle->getOperand(0);
+ auto *Op1 = Shuffle->getOperand(1);
+ if (!isa<PoisonValue>(Op1) && !isa<UndefValue>(Op1))
+ return std::nullopt;
// Find the min and max indices used by the ShuffleVector instruction.
- auto Mask = Shuffle->getShuffleMask();
+ ArrayRef<int> Mask = Shuffle->getShuffleMask();
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
auto NumElems = int(Op0Ty->getNumElements());
- for (auto Index : Mask) {
+ for (int Index : Mask) {
if (Index >= 0) {
Index %= NumElems;
OutputRange.first = std::min(Index, OutputRange.first);
@@ -3452,15 +3449,18 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
}
if (OutputRange.second < OutputRange.first)
- return {};
+ return std::nullopt;
return OutputRange;
};
+ // Find the range of vector elements used by shufflevector ops, if possible.
if (auto Indices = GetIndexRangeInShuffles()) {
- auto OldSize = VecTy->getNumElements();
- auto NewSize = Indices->second + 1u;
+ unsigned OldSize = VecTy->getNumElements();
+ unsigned NewSize = Indices->second + 1u;
+ // If the range of vector elements is smaller than the full load, attempt
+ // to create a smaller load.
if (NewSize < OldSize) {
auto Builder = IRBuilder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
>From b2a69d898812857471c9d975c8376f68e860ff1e Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 26 Mar 2025 12:37:30 +0000
Subject: [PATCH 11/11] Fix conflict with subvector load widening.
---
clang/test/CodeGenOpenCL/preserve_vec3.cl | 22 +++---
.../Transforms/Vectorize/VectorCombine.cpp | 29 ++++----
.../PhaseOrdering/X86/vec-load-combine.ll | 8 +-
.../VectorCombine/X86/load-inseltpoison.ll | 10 +--
.../VectorCombine/X86/load-widening.ll | 4 +-
.../VectorCombine/X86/shuffle-of-shuffles.ll | 24 +++---
.../VectorCombine/load-shufflevector.ll | 74 +++++++++----------
7 files changed, 83 insertions(+), 88 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index 49ebae6fc7013..e73657e30d884 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index abe16f362c986..ecbf4c16b1671 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -33,6 +33,7 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
+#include <optional>
#include <queue>
#include <tuple>
@@ -3403,38 +3404,38 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
// Attempt to shrink loads that are only used by shufflevector instructions.
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
- auto *InputShuffle = dyn_cast<ShuffleVectorInst>(&I);
- if (!InputShuffle)
- return false;
-
- auto *OldLoad = dyn_cast<LoadInst>(InputShuffle->getOperand(0u));
+ auto *OldLoad = dyn_cast<LoadInst>(&I);
if (!OldLoad || !OldLoad->isSimple())
return false;
- auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ auto *VecTy = dyn_cast<FixedVectorType>(OldLoad->getType());
if (!VecTy)
return false;
- // Search all uses of `I`. If all uses are shufflevector ops, and the second
- // operands are all poison values, find the minimum and maximum indices of
- // the vector elements referenced by all shuffle masks.
+ // Search all uses of load. If all uses are shufflevector instructions, and
+ // the second operands are all poison values, find the minimum and maximum
+ // indices of the vector elements referenced by all shuffle masks.
// Otherwise return `std::nullopt`.
using IndexRange = std::pair<int, int>;
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1);
for (auto &Use : I.uses()) {
- // All uses must be ShuffleVector instructions.
+ // All uses must be shufflevector instructions.
auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
if (!Shuffle)
return std::nullopt;
- // Get index range for value.
+ // Ignore shufflevector instructions that have no uses.
+ if (!Shuffle->hasNUsesOrMore(1u))
+ continue;
+
+ // Ensure second operand is a poison/undef value.
auto *Op0 = Shuffle->getOperand(0);
auto *Op1 = Shuffle->getOperand(1);
if (!isa<PoisonValue>(Op1) && !isa<UndefValue>(Op1))
return std::nullopt;
- // Find the min and max indices used by the ShuffleVector instruction.
+ // Find the min and max indices used by the shufflevector instruction.
ArrayRef<int> Mask = Shuffle->getShuffleMask();
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
auto NumElems = int(Op0Ty->getNumElements());
@@ -3454,7 +3455,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
return OutputRange;
};
- // Find the range of vector elements used by shufflevector ops, if possible.
+ // Get the range of vector elements used by shufflevector instructions.
if (auto Indices = GetIndexRangeInShuffles()) {
unsigned OldSize = VecTy->getNumElements();
unsigned NewSize = Indices->second + 1u;
@@ -3599,6 +3600,8 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleOfIntrinsics(I);
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
+ break;
+ case Instruction::Load:
MadeChange |= shrinkLoadForShuffles(I);
break;
case Instruction::BitCast:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 85f6fceb5bdbe..9218cc2d019f8 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 70197a756c8b3..605c029f12b33 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,8 +252,7 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -341,8 +340,7 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
@@ -577,8 +575,8 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT: [[L:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT: store <1 x i32> [[L]], ptr [[STORE_PTR:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <1 x i32> [[L]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%l = load <1 x i32>, ptr %p, align 4
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index 30a089818074e..eacc40bfa9b53 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index b30dc9ffdc596..eddfc57a7d256 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
-; SSE-NEXT: ret <4 x double> [[BLEND]]
-;
-; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; AVX-NEXT: ret <4 x double> [[BLEND]]
+; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
+; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
+; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 56c51ce7315cf..c42df19a0b7a6 100644
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -19,8 +19,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -33,8 +33,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -47,8 +47,8 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
entry:
@@ -61,8 +61,8 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -75,13 +75,13 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -108,13 +108,13 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -141,13 +141,13 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -174,8 +174,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -188,8 +188,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -202,8 +202,8 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
entry:
@@ -216,8 +216,8 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -230,13 +230,13 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -263,13 +263,13 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -296,13 +296,13 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -329,13 +329,13 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
More information about the llvm-commits
mailing list