[llvm] [X86] Fix arithmetic error in extractVector (PR #128052)

Fri Feb 21 07:37:08 PST 2025

https://github.com/daniel-zabawa updated https://github.com/llvm/llvm-project/pull/128052

>From 1df623a21c748294f684f9564c1ce97bbe654ad4 Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Thu, 20 Feb 2025 11:09:40 -0800
Subject: [PATCH 1/2] [X86] Fix arithmetic error in extractVector

The computation of the element count for the result VT in extractSubVector
is incorrect when vectorWidth does not evenly divide VT.getSizeInBits(),
which can occur when the source vector element count is not a power of
two, e.g. extracting a 256b (vectorWidth) vector from a 384b source: the
integer division truncates Factor to 1, so the result VT keeps the full
source element count.

This rewrites the expression to multiply before dividing, so the division
is exact given that vectorWidth is a multiple of the source element size.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  6 ++---
 llvm/test/CodeGen/X86/pr128052.ll       | 30 +++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/pr128052.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1c9d43ce4c062..d79dd9d5cdd72 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4066,9 +4066,9 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                 const SDLoc &dl, unsigned vectorWidth) {
   EVT VT = Vec.getValueType();
   EVT ElVT = VT.getVectorElementType();
-  unsigned Factor = VT.getSizeInBits() / vectorWidth;
-  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
-                                  VT.getVectorNumElements() / Factor);
+  unsigned ResultNumElts =
+      (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
+  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
 
   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
diff --git a/llvm/test/CodeGen/X86/pr128052.ll b/llvm/test/CodeGen/X86/pr128052.ll
new file mode 100644
index 0000000000000..1a67e64b69832
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr128052.ll
@@ -0,0 +1,30 @@
+; Ensure assertion is not hit when folding concat of two contiguous extract_subvector operations
+; from a source with a non-power-of-two vector length.
+; RUN: llc -mattr=+avx2 < %s
+
+source_filename = "foo.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(ptr noundef %pDst, ptr noundef %pSrc) {
+bb0:
+  %sptr1 = getelementptr i8, ptr %pSrc, i64 32
+  %load598 = load <12 x float>, ptr %sptr1, align 1
+  br label %bb1
+bb1:
+  %sptr0 = getelementptr i8, ptr %pSrc, i64 16
+  %load617 = load <12 x float>, ptr %sptr0, align 1
+  %42 = fsub contract <12 x float> %load617, %load598
+  %43 = shufflevector <12 x float> %42, <12 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %44 = fsub contract <12 x float> %load617, %load598
+  %45 = shufflevector <12 x float> %44, <12 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %46 = fsub contract <12 x float> %load617, %load598
+  %47 = shufflevector <12 x float> %46, <12 x float> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %dptr0 = getelementptr i8, ptr %pDst, i64 16
+  %dptr1 = getelementptr i8, ptr %pDst, i64 32 
+  %dptr2 = getelementptr i8, ptr %pDst, i64 48
+  store <4 x float> %43, ptr %dptr0, align 1
+  store <4 x float> %45, ptr %dptr1, align 1
+  store <4 x float> %47, ptr %dptr2, align 1
+  ret void
+}

>From b338971eefbcce9bfe89a868c2545943b65b88d8 Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Fri, 21 Feb 2025 07:36:49 -0800
Subject: [PATCH 2/2] reduce/rename test and add assertion

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  3 ++
 .../isel-extract-subvector-non-pow2-elems.ll  | 21 +++++++++++++
 llvm/test/CodeGen/X86/pr128052.ll             | 30 -------------------
 3 files changed, 24 insertions(+), 30 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/isel-extract-subvector-non-pow2-elems.ll
 delete mode 100644 llvm/test/CodeGen/X86/pr128052.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d79dd9d5cdd72..97510a3091f69 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4070,6 +4070,9 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
       (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
 
+  assert(ResultVT.getSizeInBits() == vectorWidth &&
+         "Illegal subvector extraction");
+
   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
diff --git a/llvm/test/CodeGen/X86/isel-extract-subvector-non-pow2-elems.ll b/llvm/test/CodeGen/X86/isel-extract-subvector-non-pow2-elems.ll
new file mode 100644
index 0000000000000..d699b1a182845
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-extract-subvector-non-pow2-elems.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Ensure assertion is not hit when folding concat of two contiguous extract_subvector operations
+; from a source with a non-power-of-two vector length.
+; RUN: llc -mtriple=x86_64 -mattr=+avx2 < %s | FileCheck %s
+
+define void @foo(ptr %pDst) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovups %ymm0, 16(%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %0 = shufflevector <12 x float> zeroinitializer, <12 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <12 x float> zeroinitializer, <12 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %2 = getelementptr i8, ptr %pDst, i64 16
+  %3 = getelementptr i8, ptr %pDst, i64 32
+  store <4 x float> %0, ptr %2, align 1
+  store <4 x float> %1, ptr %3, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr128052.ll b/llvm/test/CodeGen/X86/pr128052.ll
deleted file mode 100644
index 1a67e64b69832..0000000000000
--- a/llvm/test/CodeGen/X86/pr128052.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; Ensure assertion is not hit when folding concat of two contiguous extract_subvector operations
-; from a source with a non-power-of-two vector length.
-; RUN: llc -mattr=+avx2 < %s
-
-source_filename = "foo.c"
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define void @foo(ptr noundef %pDst, ptr noundef %pSrc) {
-bb0:
-  %sptr1 = getelementptr i8, ptr %pSrc, i64 32
-  %load598 = load <12 x float>, ptr %sptr1, align 1
-  br label %bb1
-bb1:
-  %sptr0 = getelementptr i8, ptr %pSrc, i64 16
-  %load617 = load <12 x float>, ptr %sptr0, align 1
-  %42 = fsub contract <12 x float> %load617, %load598
-  %43 = shufflevector <12 x float> %42, <12 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %44 = fsub contract <12 x float> %load617, %load598
-  %45 = shufflevector <12 x float> %44, <12 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %46 = fsub contract <12 x float> %load617, %load598
-  %47 = shufflevector <12 x float> %46, <12 x float> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-  %dptr0 = getelementptr i8, ptr %pDst, i64 16
-  %dptr1 = getelementptr i8, ptr %pDst, i64 32 
-  %dptr2 = getelementptr i8, ptr %pDst, i64 48
-  store <4 x float> %43, ptr %dptr0, align 1
-  store <4 x float> %45, ptr %dptr1, align 1
-  store <4 x float> %47, ptr %dptr2, align 1
-  ret void
-}