擅长:python、mysql、java
<p>好吧,我想我找到了一个解决方案,它不是很优雅,但至少能用!
应该有更好的方法,有什么建议吗?</p>
<pre><code>extern "C"{
// Adds ndarray2 to ndarray1 element-wise, in place:
//     ndarray1[i] += ndarray2[i]   for 0 <= i < path_cnt
//
// ndarray1  destination and left operand; must hold at least path_cnt floats.
// ndarray2  right operand; must hold at least path_cnt floats. Not modified.
// path_cnt  number of elements to process; values <= 0 make the call a no-op.
//
// Always returns 0.
//
// The arrays need no particular alignment. They may be the same array, but
// must not partially overlap.
int foobar(float * ndarray1,float * ndarray2,int path_cnt)
{
    int i = 0;

#ifdef __AVX__
    // Work directly on the caller's buffers with unaligned loads/stores: this
    // avoids the temporary aligned copies (and the need to free them).
    // Only full groups of 8 floats are handled here, so the 256-bit accesses
    // never touch memory past element path_cnt - 1.
    for (; path_cnt - i >= 8; i += 8)
    {
        __m256 lhs = _mm256_loadu_ps(&ndarray1[i]);
        __m256 rhs = _mm256_loadu_ps(&ndarray2[i]);
        _mm256_storeu_ps(&ndarray1[i], _mm256_add_ps(lhs, rhs));
    }
#endif

    // Scalar tail: the remaining (path_cnt % 8) elements, or every element
    // when the translation unit is built without AVX support.
    for (; i < path_cnt; i++)
    {
        ndarray1[i] += ndarray2[i];
    }

    return 0;
}
}
</code></pre>