[llvm] r276104 - [X86][SSE] Add cost model values for CTPOP of vectors
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 20 03:41:28 PDT 2016
Author: rksimon
Date: Wed Jul 20 05:41:28 2016
New Revision: 276104
URL: http://llvm.org/viewvc/llvm-project?rev=276104&view=rev
Log:
[X86][SSE] Add cost model values for CTPOP of vectors
This patch adds costs for the vectorized implementations of CTPOP, the default values were seriously underestimating the cost of these and was encouraging vectorization on targets where serialized use of POPCNT would be much better.
Differential Revision: https://reviews.llvm.org/D22456
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/trunk/test/Analysis/CostModel/X86/ctbits-cost.ll
llvm/trunk/test/Transforms/SLPVectorizer/X86/ctpop.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=276104&r1=276103&r2=276104&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jul 20 05:41:28 2016
@@ -21052,6 +21052,8 @@ static SDValue LowerVectorCTPOPBitmath(S
DAG);
}
+// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
+// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=276104&r1=276103&r2=276104&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Wed Jul 20 05:41:28 2016
@@ -945,6 +945,10 @@ int X86TTIImpl::getCmpSelInstrCost(unsig
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys, FastMathFlags FMF) {
+ // Costs should match the codegen from:
+ // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
+ // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
+ // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
{ ISD::BITREVERSE, MVT::v8i32, 4 },
@@ -966,7 +970,11 @@ int X86TTIImpl::getIntrinsicInstrCost(In
{ ISD::BITREVERSE, MVT::v32i8, 5 },
{ ISD::BSWAP, MVT::v4i64, 1 },
{ ISD::BSWAP, MVT::v8i32, 1 },
- { ISD::BSWAP, MVT::v16i16, 1 }
+ { ISD::BSWAP, MVT::v16i16, 1 },
+ { ISD::CTPOP, MVT::v4i64, 7 },
+ { ISD::CTPOP, MVT::v8i32, 11 },
+ { ISD::CTPOP, MVT::v16i16, 9 },
+ { ISD::CTPOP, MVT::v32i8, 6 }
};
static const CostTblEntry AVX1CostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 10 },
@@ -975,7 +983,11 @@ int X86TTIImpl::getIntrinsicInstrCost(In
{ ISD::BITREVERSE, MVT::v32i8, 10 },
{ ISD::BSWAP, MVT::v4i64, 4 },
{ ISD::BSWAP, MVT::v8i32, 4 },
- { ISD::BSWAP, MVT::v16i16, 4 }
+ { ISD::BSWAP, MVT::v16i16, 4 },
+ { ISD::CTPOP, MVT::v4i64, 14 },
+ { ISD::CTPOP, MVT::v8i32, 22 },
+ { ISD::CTPOP, MVT::v16i16, 18 },
+ { ISD::CTPOP, MVT::v32i8, 12 }
};
static const CostTblEntry SSSE3CostTbl[] = {
{ ISD::BITREVERSE, MVT::v2i64, 5 },
@@ -984,12 +996,20 @@ int X86TTIImpl::getIntrinsicInstrCost(In
{ ISD::BITREVERSE, MVT::v16i8, 5 },
{ ISD::BSWAP, MVT::v2i64, 1 },
{ ISD::BSWAP, MVT::v4i32, 1 },
- { ISD::BSWAP, MVT::v8i16, 1 }
+ { ISD::BSWAP, MVT::v8i16, 1 },
+ { ISD::CTPOP, MVT::v2i64, 7 },
+ { ISD::CTPOP, MVT::v4i32, 11 },
+ { ISD::CTPOP, MVT::v8i16, 9 },
+ { ISD::CTPOP, MVT::v16i8, 6 }
};
static const CostTblEntry SSE2CostTbl[] = {
{ ISD::BSWAP, MVT::v2i64, 7 },
{ ISD::BSWAP, MVT::v4i32, 7 },
- { ISD::BSWAP, MVT::v8i16, 7 }
+ { ISD::BSWAP, MVT::v8i16, 7 },
+ { ISD::CTPOP, MVT::v2i64, 12 },
+ { ISD::CTPOP, MVT::v4i32, 15 },
+ { ISD::CTPOP, MVT::v8i16, 13 },
+ { ISD::CTPOP, MVT::v16i8, 10 }
};
unsigned ISD = ISD::DELETED_NODE;
@@ -1002,6 +1022,9 @@ int X86TTIImpl::getIntrinsicInstrCost(In
case Intrinsic::bswap:
ISD = ISD::BSWAP;
break;
+ case Intrinsic::ctpop:
+ ISD = ISD::CTPOP;
+ break;
}
// Legalize the type.
Modified: llvm/trunk/test/Analysis/CostModel/X86/ctbits-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/ctbits-cost.ll?rev=276104&r1=276103&r2=276104&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/ctbits-cost.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/ctbits-cost.ll Wed Jul 20 05:41:28 2016
@@ -58,72 +58,88 @@ declare <32 x i8> @llvm.ctpop.v32i8(<32
define <2 x i64> @var_ctpop_v2i64(<2 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v2i64':
-; SSE: Found an estimated cost of 2 for instruction: %ctpop
-; AVX: Found an estimated cost of 2 for instruction: %ctpop
-; XOP: Found an estimated cost of 2 for instruction: %ctpop
+; SSE2: Found an estimated cost of 12 for instruction: %ctpop
+; SSE42: Found an estimated cost of 7 for instruction: %ctpop
+; AVX: Found an estimated cost of 7 for instruction: %ctpop
+; XOP: Found an estimated cost of 7 for instruction: %ctpop
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
ret <2 x i64> %ctpop
}
define <4 x i64> @var_ctpop_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v4i64':
-; SSE: Found an estimated cost of 4 for instruction: %ctpop
-; AVX: Found an estimated cost of 2 for instruction: %ctpop
-; XOP: Found an estimated cost of 2 for instruction: %ctpop
+; SSE2: Found an estimated cost of 24 for instruction: %ctpop
+; SSE42: Found an estimated cost of 14 for instruction: %ctpop
+; AVX1: Found an estimated cost of 14 for instruction: %ctpop
+; AVX2: Found an estimated cost of 7 for instruction: %ctpop
+; XOPAVX1: Found an estimated cost of 14 for instruction: %ctpop
+; XOPAVX2: Found an estimated cost of 7 for instruction: %ctpop
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
ret <4 x i64> %ctpop
}
define <4 x i32> @var_ctpop_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v4i32':
-; SSE: Found an estimated cost of 2 for instruction: %ctpop
-; AVX: Found an estimated cost of 2 for instruction: %ctpop
-; XOP: Found an estimated cost of 2 for instruction: %ctpop
+; SSE2: Found an estimated cost of 15 for instruction: %ctpop
+; SSE42: Found an estimated cost of 11 for instruction: %ctpop
+; AVX: Found an estimated cost of 11 for instruction: %ctpop
+; XOP: Found an estimated cost of 11 for instruction: %ctpop
%ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
ret <4 x i32> %ctpop
}
define <8 x i32> @var_ctpop_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i32':
-; SSE: Found an estimated cost of 4 for instruction: %ctpop
-; AVX: Found an estimated cost of 2 for instruction: %ctpop
-; XOP: Found an estimated cost of 2 for instruction: %ctpop
+; SSE2: Found an estimated cost of 30 for instruction: %ctpop
+; SSE42: Found an estimated cost of 22 for instruction: %ctpop
+; AVX1: Found an estimated cost of 22 for instruction: %ctpop
+; AVX2: Found an estimated cost of 11 for instruction: %ctpop
+; XOPAVX1: Found an estimated cost of 22 for instruction: %ctpop
+; XOPAVX2: Found an estimated cost of 11 for instruction: %ctpop
%ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
ret <8 x i32> %ctpop
}
define <8 x i16> @var_ctpop_v8i16(<8 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i16':
-; SSE: Found an estimated cost of 2 for instruction: %ctpop
-; AVX: Found an estimated cost of 2 for instruction: %ctpop
-; XOP: Found an estimated cost of 2 for instruction: %ctpop
+; SSE2: Found an estimated cost of 13 for instruction: %ctpop
+; SSE42: Found an estimated cost of 9 for instruction: %ctpop
+; AVX: Found an estimated cost of 9 for instruction: %ctpop
+; XOP: Found an estimated cost of 9 for instruction: %ctpop
%ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
ret <8 x i16> %ctpop
}
define <16 x i16> @var_ctpop_v16i16(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i16':
-; SSE: Found an estimated cost of 4 for instruction: %ctpop
-; AVX: Found an estimated cost of 2 for instruction: %ctpop
-; XOP: Found an estimated cost of 2 for instruction: %ctpop
+; SSE2: Found an estimated cost of 26 for instruction: %ctpop
+; SSE42: Found an estimated cost of 18 for instruction: %ctpop
+; AVX1: Found an estimated cost of 18 for instruction: %ctpop
+; AVX2: Found an estimated cost of 9 for instruction: %ctpop
+; XOPAVX1: Found an estimated cost of 18 for instruction: %ctpop
+; XOPAVX2: Found an estimated cost of 9 for instruction: %ctpop
%ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
ret <16 x i16> %ctpop
}
define <16 x i8> @var_ctpop_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i8':
-; SSE: Found an estimated cost of 2 for instruction: %ctpop
-; AVX: Found an estimated cost of 2 for instruction: %ctpop
-; XOP: Found an estimated cost of 2 for instruction: %ctpop
+; SSE2: Found an estimated cost of 10 for instruction: %ctpop
+; SSE42: Found an estimated cost of 6 for instruction: %ctpop
+; AVX: Found an estimated cost of 6 for instruction: %ctpop
+; XOP: Found an estimated cost of 6 for instruction: %ctpop
%ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
ret <16 x i8> %ctpop
}
define <32 x i8> @var_ctpop_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v32i8':
-; SSE: Found an estimated cost of 4 for instruction: %ctpop
-; AVX: Found an estimated cost of 2 for instruction: %ctpop
-; XOP: Found an estimated cost of 2 for instruction: %ctpop
+; SSE2: Found an estimated cost of 20 for instruction: %ctpop
+; SSE42: Found an estimated cost of 12 for instruction: %ctpop
+; AVX1: Found an estimated cost of 12 for instruction: %ctpop
+; AVX2: Found an estimated cost of 6 for instruction: %ctpop
+; XOPAVX1: Found an estimated cost of 12 for instruction: %ctpop
+; XOPAVX2: Found an estimated cost of 6 for instruction: %ctpop
%ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
ret <32 x i8> %ctpop
}
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/ctpop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/ctpop.ll?rev=276104&r1=276103&r2=276104&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/ctpop.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/ctpop.ll Wed Jul 20 05:41:28 2016
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+sse2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+sse4.2,+popcnt -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+avx,+popcnt -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+avx2,+popcnt -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -21,9 +22,12 @@ declare i8 @llvm.ctpop.i8(i8)
define void @ctpop_2i64() #0 {
; CHECK-LABEL: @ctpop_2i64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]])
-; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; CHECK-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; CHECK-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
; CHECK-NEXT: ret void
;
%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
@@ -37,19 +41,40 @@ define void @ctpop_2i64() #0 {
define void @ctpop_4i64() #0 {
; SSE-LABEL: @ctpop_4i64(
-; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4
-; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4
-; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]])
-; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP2]])
-; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
-; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; SSE-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; SSE-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; SSE-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
+; SSE-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
+; SSE-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; SSE-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; SSE-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; SSE-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
; SSE-NEXT: ret void
;
-; AVX-LABEL: @ctpop_4i64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
-; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[TMP1]])
-; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
-; AVX-NEXT: ret void
+; AVX1-LABEL: @ctpop_4i64(
+; AVX1-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; AVX1-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; AVX1-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; AVX1-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; AVX1-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; AVX1-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; AVX1-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
+; AVX1-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
+; AVX1-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; AVX1-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; AVX1-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; AVX1-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ctpop_4i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[TMP1]])
+; AVX2-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; AVX2-NEXT: ret void
;
%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
@@ -67,11 +92,41 @@ define void @ctpop_4i64() #0 {
}
define void @ctpop_4i32() #0 {
-; CHECK-LABEL: @ctpop_4i32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; CHECK-NEXT: ret void
+; SSE2-LABEL: @ctpop_4i32(
+; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
+; SSE2-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctpop_4i32(
+; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
+; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
+; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
+; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
+; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; SSE42-NEXT: ret void
+;
+; AVX-LABEL: @ctpop_4i32(
+; AVX-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; AVX-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; AVX-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; AVX-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; AVX-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
+; AVX-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
+; AVX-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
+; AVX-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
+; AVX-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX-NEXT: ret void
;
%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
@@ -89,20 +144,74 @@ define void @ctpop_4i32() #0 {
}
define void @ctpop_8i32() #0 {
-; SSE-LABEL: @ctpop_8i32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
-; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]])
-; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE-NEXT: ret void
-;
-; AVX-LABEL: @ctpop_8i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
-; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> [[TMP1]])
-; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
-; AVX-NEXT: ret void
+; SSE2-LABEL: @ctpop_8i32(
+; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE2-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
+; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]])
+; SSE2-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctpop_8i32(
+; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; SSE42-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; SSE42-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; SSE42-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; SSE42-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
+; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
+; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
+; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
+; SSE42-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]])
+; SSE42-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]])
+; SSE42-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]])
+; SSE42-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]])
+; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; SSE42-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; SSE42-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; SSE42-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; SSE42-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE42-NEXT: ret void
+;
+; AVX1-LABEL: @ctpop_8i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; AVX1-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
+; AVX1-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
+; AVX1-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
+; AVX1-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
+; AVX1-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]])
+; AVX1-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]])
+; AVX1-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]])
+; AVX1-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]])
+; AVX1-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; AVX1-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; AVX1-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; AVX1-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; AVX1-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; AVX1-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; AVX1-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; AVX1-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ctpop_8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> [[TMP1]])
+; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX2-NEXT: ret void
;
%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
More information about the llvm-commits
mailing list