结合使用 ctypes 与 SSE/AVX 时，Python 有时会出现段错误（segfault）

from ctypes import POINTER, c_float, c_int, c_void_p, cdll

import numpy as np

# Number of float32 elements in each array; also handed to the C side as a C int.
path_cnt = 16
c_path_cnt = c_int(path_cnt)

# First operand: C-contiguous float32 array of ones, exposed as a float*.
# NOTE(review): setflags(align=1) only sets NumPy's own ALIGNED flag; it
# presumably does not move the buffer onto the 32-byte boundary that aligned
# AVX loads need -- confirm against the NumPy documentation.
ndarray1 = np.ones(path_cnt, dtype=np.float32, order='C')
ndarray1.setflags(align=1, write=1)
c_ndarray1 = ndarray1.ctypes.data_as(POINTER(c_float))

# Second operand, built the same way.
ndarray2 = np.ones(path_cnt, dtype=np.float32, order='C')
ndarray2.setflags(align=1, write=1)
c_ndarray2 = ndarray2.ctypes.data_as(POINTER(c_float))

# Load the shared library and declare foobar's argument types before calling it.
finance = cdll.LoadLibrary(".../libfin.so")
finance.foobar.argtypes = [c_void_p, c_void_p, c_int]
finance.foobar(c_ndarray1, c_ndarray2, c_path_cnt)

# Print the first array element by element through the ctypes pointer.
for x in range(path_cnt):
    print(c_ndarray1[x])

2条回答

网友

1楼 · 编辑于 2024-10-08 18:23:59

加载指令分为对齐和未对齐两种版本。对齐版本速度更快，但如果传入的地址违反了对齐要求就会出错；未对齐版本可以接受任意地址，并在内部通过加载/移位来取得所需的数据。你使用的是对齐版本 _mm256_load_ps，可以直接换成未对齐版本 _mm256_loadu_ps，而无需任何中间内存分配。

一个好的自动向量化编译器会先生成一段前导循环（prologue）以到达对齐的地址，接着是处理对齐数据的主循环体，最后再用一段收尾循环（epilogue）处理剩余的元素。

网友

2楼 · 编辑于 2024-10-08 18:23:59

好吧，我想我找到了一个解决方案，它不是很优雅，但至少有效！应该还有更好的方法，有什么建议吗？

extern "C"{
int foobar(float * ndarray1,float * ndarray2,int path_cnt)
 {
     float * test = (float*)_mm_malloc(path_cnt*sizeof(float),32);
     float * test2 = (float*)_mm_malloc(path_cnt*sizeof(float),32);
     //copy to aligned memory(this part is kinda stupid)
     for(int i=0;i<path_cnt;i++)
     {
        test[i] = stock[i];
        test2[i] = max_vola[i];

     }
     for(int i=0;i<path_cnt;i=i+8)
     {
         __m256 arr1                = _mm256_load_ps(&test1[i]);
         __m256 arr2                    = _mm256_load_ps(&test2[i]);
         __m256 add                 = _mm256_add_ps(arr1,arr2);
         _mm256_store_ps(&test1[i],add);
     }
  //and copy everything back!
   for(int i=0;i<path_cnt;i++)
    {
    stock[i] = test[i];   
    }
     return 0;
 }
}

相关问题更多 >

编程相关推荐

热门问题

热门文章