[LLVMdev] Seg faulting on vector ops

Tue Jul 24 13:58:05 PDT 2007

Hrm. This problem shouldn't be target specific. I am pretty sure the
prologue/epilogue inserter aligns the stack correctly if there are
stack objects with a greater-than-default stack alignment requirement.
It seems to me the initial alloca instruction should specify 16-byte
alignment?

Evan

On Jul 21, 2007, at 2:51 PM, Chris Lattner wrote:

> On Fri, 20 Jul 2007, Chuck Rose III wrote:
>> I'm looking to make use of the vectorization primitives in the Intel
>> chip with the code we generate from LLVM and so I've started
>> experimenting with it.  What is the state of the machine code generated
>> for vectors?  In my tinkering, I seem to be getting some wonky machine
>> instructions, but I'm most likely just doing something wrong and I'm
>> hoping you can set me on the correct course.
>
> Hi Chuck,
>
> Evan's solution is the right one.  However, your code is valid, so it
> shouldn't crash.  I think it dies because Linux does not guarantee that
> the stack is 16-byte aligned, and the vector operations expect this.  The
> code generator should compensate and dynamically align the stack on entry
> to the function.  This should be a relatively straightforward extension
> to the x86 backend if you're interested.
>
> -Chris
>
>> My minimal function creates a float4 vector with a specified scalar in
>> all the elements.  It then extracts the element at index 3 and returns it.
>>
>>
>>
>> We are currently using the JIT and I'm currently synced to about a week
>> after the 2.0 branch, so I'm admittedly stale by about a month.
>>
>>
>>
>> In LLVM IR:
>>
>>
>>
>> ; ModuleID = 'test vectors'
>>
>>
>>
>> define float @vSelect3(float %x) {
>>
>> body:
>>
>>        %pv = alloca <4 x float>                ; <<4 x float>*> [#uses=1]
>>
>>        %v = load <4 x float>* %pv              ; <<4 x float>> [#uses=1]
>>
>>        %v1 = insertelement <4 x float> %v, float %x, i32 0     ; <<4 x float>> [#uses=1]
>>
>>        %v2 = insertelement <4 x float> %v1, float %x, i32 1    ; <<4 x float>> [#uses=1]
>>
>>        %v3 = insertelement <4 x float> %v2, float %x, i32 2    ; <<4 x float>> [#uses=1]
>>
>>        %v4 = insertelement <4 x float> %v3, float %x, i32 3    ; <<4 x float>> [#uses=1]
>>
>>        %s = extractelement <4 x float> %v4, i32 3              ; <float> [#uses=1]
>>
>>        ret float %s
>>
>> }
>>
>>
>>
>> In Intel assembly, I get the following:
>>
>>
>>
>> 00000000`01b80010 83ec20          sub     esp,20h
>>
>> 00000000`01b80013 f30f10442424    movss   xmm0,dword ptr [esp+24h]   <--
>> this loads x into the low float of xmm0
>>
>> 00000000`01b80019 0f284c2404      movaps  xmm1,xmmword ptr [esp+4]   <--
>> this seg faults because esp+4 isn't 16-byte aligned
>>
>> What is that line trying to achieve?  x is at [esp+24h].  There weren't
>> any other parameters.
>>
>>
>>
>> 00000000`01b8001e f30f10c8        movss   xmm1,xmm0
>>
>> 00000000`01b80022 8b442424        mov     eax,dword ptr [esp+24h]
>>
>> 00000000`01b80026 660fc4c802      pinsrw  xmm1,eax,2
>>
>> 00000000`01b8002b 89c1            mov     ecx,eax
>>
>> 00000000`01b8002d c1e910          shr     ecx,10h
>>
>> 00000000`01b80030 660fc4c903      pinsrw  xmm1,ecx,3
>>
>> 00000000`01b80035 660fc4c804      pinsrw  xmm1,eax,4
>>
>> 00000000`01b8003a 660fc4c905      pinsrw  xmm1,ecx,5
>>
>> 00000000`01b8003f 660fc4c806      pinsrw  xmm1,eax,6
>>
>> 00000000`01b80044 660fc4c907      pinsrw  xmm1,ecx,7
>>
>> 00000000`01b80049 0fc6c903        shufps  xmm1,xmm1,3
>>
>> 00000000`01b8004d f30f110c24      movss   dword ptr [esp],xmm1
>>
>> 00000000`01b80052 d90424          fld     dword ptr [esp]
>>
>> 00000000`01b80055 83c420          add     esp,20h
>>
>> 00000000`01b80058 c3              ret
>>
>>
>>
>> The code used to generate and run the program was:
>>
>>
>>
>> #include "llvm/Module.h"
>>
>> #include "llvm/DerivedTypes.h"
>>
>> #include "llvm/Constants.h"
>>
>> #include "llvm/Instructions.h"
>>
>> #include "llvm/ModuleProvider.h"
>>
>> #include "llvm/Analysis/Verifier.h"
>>
>> #include "llvm/System/DynamicLibrary.h"
>>
>> #include "llvm/ExecutionEngine/JIT.h"
>>
>> #include "llvm/ExecutionEngine/Interpreter.h"
>>
>> #include "llvm/ExecutionEngine/GenericValue.h"
>>
>> #include "llvm/Support/ManagedStatic.h"
>>
>> #include <iostream>
>>
>> using namespace llvm;
>>
>>
>>
>> Value* makeVector(Value* s, unsigned int dim, BasicBlock* basicBlock)
>>
>> {
>>
>>    AllocaInst* pV = new
>> AllocaInst(VectorType::get(Type::FloatTy,dim),"pv",basicBlock);
>>
>>    Value* v = new LoadInst(pV,"v",basicBlock);
>>
>>
>>
>>    for (unsigned int i = 0 ; i < dim ; ++i)
>>
>>        v = new InsertElementInst(v,s,i,"v",basicBlock);
>>
>>
>>
>>    return v;
>>
>> }
>>
>>
>>
>> Function* generateVectorAndSelect(Module* pModule)
>>
>> {
>>
>>    std::vector<Type const*> params;
>>
>>
>>
>>    params.push_back(Type::FloatTy);
>>
>>
>>
>>    FunctionType* funcType =
>> FunctionType::get(Type::FloatTy,params,NULL);
>>
>>    Function* func =
>> cast<Function>(pModule->getOrInsertFunction("vSelect3",funcType));
>>
>>
>>
>>    BasicBlock* basicBlock = new BasicBlock("body",func);
>>
>>
>>
>>    Function::arg_iterator args = func->arg_begin();
>>
>>    Argument* x = args;
>>
>>    x->setName("x");
>>
>>
>>
>>    Value* v1 = makeVector(x,4,basicBlock);
>>
>>
>>
>>    Value* s = new ExtractElementInst(v1,3,"s",basicBlock);
>>
>>
>>
>>    new ReturnInst(s,basicBlock);
>>
>>
>>
>>    return func;
>>
>> }
>>
>>
>>
>> // modified from the fibonacci example
>>
>> int main(int argc, char **argv)
>>
>> {
>>
>>    Module* pVectorModule = new Module("test vectors");
>>
>>
>>
>>    Function* pMain = generateVectorAndSelect(pVectorModule);
>>
>>
>>
>>    pVectorModule->print(std::cout);
>>
>>
>>
>>    GenericValue gv1, gv2, gvR;
>>
>>
>>
>>    gv1.FloatVal = 2.0f;
>>
>>
>>
>>    ExistingModuleProvider *pMP = new
>> ExistingModuleProvider(pVectorModule);
>>
>>
>>    pMP->getModule()->setDataLayout("e-p:32:32:32-i1:8:8:8-i8:8:8:8-i32:32:32:32-f32:32:32:32");
>>
>>    ExecutionEngine *pEE = ExecutionEngine::create(pMP, false);
>>
>>
>>
>>    std::vector<GenericValue> args;
>>
>>
>>
>>    args.push_back(gv1);
>>
>>
>>
>>    GenericValue result = pEE->runFunction(pMain, args);
>>
>>
>>
>>    return 0;
>>
>> }
>>
>>
>>
>>
>>
>> Any help would be appreciated.
>>
>> .
>>
>> Thanks,
>>
>> Chuck.
>>
>>
>
> -Chris
>
> -- 
> http://nondot.org/sabre/
> http://llvm.org/
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev