linux下用pthread,单线程运行SSE2程序比多线程快,这是为什么???
这是单线程版本(程序很简单,就是将两个大数组相加,重复多次)
#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>
/*
 * Element-wise addition: c[i] = a[i] + b[i] for every i in [0, len).
 *
 * a, b  input arrays holding at least len floats
 * c     output array holding at least len floats
 * len   number of elements to add; nothing is written when len <= 0
 *
 * Four floats are added per iteration with SSE; any remaining 1-3
 * elements are added one at a time so no element is skipped when len is
 * not a multiple of four.
 */
void add(float *a,float *b,float *c,int len)
{
    int i = 0;

    /*
     * Unaligned load/store variants are used because nothing guarantees
     * that the caller's arrays start on a 16-byte boundary, which the
     * aligned _mm_load_ps/_mm_store_ps forms require.
     */
    for (; len - i >= 4; i += 4) {
        __m128 lhs = _mm_loadu_ps(a + i);
        __m128 rhs = _mm_loadu_ps(b + i);
        _mm_storeu_ps(c + i, _mm_add_ps(lhs, rhs));
    }

    /* Scalar tail for the last len % 4 elements. */
    for (; i < len; i++) {
        c[i] = a[i] + b[i];
    }
}
/*
 * Benchmark driver: fills two 990000-element arrays with the value 3 and
 * calls add() on them one million times.
 */
int main()
{
    /*
     * Static storage instead of local variable-length arrays: the three
     * arrays need about 11.9 MB in total, more than the stack limit that
     * is typical on Linux (8 MB), so declaring them as locals can crash
     * before any work is done. The explicit 16-byte alignment keeps the
     * arrays usable with aligned SSE loads and stores.
     */
    enum { len = 990000 };
    static float a[len] __attribute__((aligned(16)));
    static float b[len] __attribute__((aligned(16)));
    static float c[len] __attribute__((aligned(16)));
    int i;

    for (i = 0; i < len; i++) {
        a[i] = b[i] = 3;
    }

    for (i = 0; i < 1000000; i++) {
        add(a, b, c, len);
    }
    return 0;
}
---------------------------------------------------
这是多线程版:
#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>
#include <pthread.h>
/* Number of worker threads that add() creates on every call. */
int num_t = 4 ;
/*
 * Index of the next group of four floats to process. Shared by all worker
 * threads and reset to 0 at the start of each add() call.
 */
static int count;
/*
 * Arguments handed to a worker thread: the two input arrays, the output
 * array and the number of floats in each of them.
 */
typedef struct {
    float *a;   /* first input array */
    float *b;   /* second input array */
    float *c;   /* output array, receives a[i] + b[i] */
    int len;    /* number of floats in a, b and c */
} myData;
void *myThread(myData *data)
{
__m128 t0,t1;
while(count<data->len/4){
//printf("%d\n",count);
t0 = _mm_load_ps(data->a+count*4);
t1 = _mm_load_ps(data->b+count*4);
t0 = _mm_add_ps(t0,t1);
_mm_store_ps(data->c+count*4,t0);
count++;
}
}
/*
 * Adds a and b element-wise into c using num_t worker threads.
 *
 * a, b  input arrays of len floats
 * c     output array of len floats
 * len   number of elements
 *
 * Every call creates the worker threads, waits for all of them and then
 * returns; the thread start-up and join cost is therefore paid on each
 * call. The process exits with status -1 if a thread cannot be created
 * or joined.
 */
void add(float *a,float *b,float *c,int len)
{
    /* A non-positive num_t would give the array below an invalid size. */
    const int threads = (num_t > 0) ? num_t : 1;
    pthread_t pid[threads];
    myData data;
    int i;

    /* Restart the shared work index before any worker runs. */
    count = 0;

    data.a = a;
    data.b = b;
    data.c = c;
    data.len = len;

    for (i = 0; i < threads; i++) {
        /*
         * Cast to the start-routine type rather than to void *:
         * converting a function pointer to an object pointer is not
         * portable C.
         */
        if (pthread_create(&pid[i], NULL, (void *(*)(void *))myThread, &data)) {
            printf("Create thread failed \n");
            exit(-1);
        }
    }

    /* data lives on this stack frame, so all workers must finish here. */
    for (i = 0; i < threads; i++) {
        if (pthread_join(pid[i], NULL)) {
            printf("Join thread failed \n");
            exit(-1);
        }
    }
}
/*
 * Benchmark driver for the threaded version: fills two 990000-element
 * arrays with the value 3 and calls add() on them one million times.
 */
int main()
{
    /*
     * Static storage instead of local variable-length arrays: the three
     * arrays need about 11.9 MB in total, more than the stack limit that
     * is typical on Linux (8 MB), so declaring them as locals can crash
     * before any work is done. The explicit 16-byte alignment keeps the
     * arrays usable with aligned SSE loads and stores.
     */
    enum { len = 990000 };
    static float a[len] __attribute__((aligned(16)));
    static float b[len] __attribute__((aligned(16)));
    static float c[len] __attribute__((aligned(16)));
    int i;

    for (i = 0; i < len; i++) {
        a[i] = b[i] = 3;
    }

    for (i = 0; i < 1000000; i++) {
        add(a, b, c, len);
    }
    return 0;
}
谁能告诉我为什么啊??? linux 多线程 pthread SSE2
[解决办法]
能不能附上makefile,我运行一下???
[解决办法]
楼主,你的代码有很大问题:单线程版本运行100万次add,多线程版本也运行100万次add;
但多线程版本每次调用add都要在内部创建并等待线程,而单线程版本没有这个开销,所以多线程版本当然比单线程版本慢。
[解决办法]
同一份代码,编译时指定线程数量
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <emmintrin.h>
#include <time.h>
/*
 * Work description for one thread plus the time that thread reports back.
 */
typedef struct {
    float *a;     /* first input slice */
    float *b;     /* second input slice */
    float *c;     /* output slice */
    int len;      /* number of floats in this thread's slice */
    int repeat;   /* repetition count; the larger this value, the more
                     pronounced the advantage of multithreading */
    time_t sec;   /* time consumed, as a difference of two time() readings */
} myData;
void* myThread(myData *data)
{
//__m128 t0,t1;
int count = 0;
time_t start = time(0);
while(data->repeat > 0)
{
for(count = 0; count < data->len; ++count)
{
t0 = _mm_load_ps(data->a+count*4);
t1 = _mm_load_ps(data->b+count*4);
t0 = _mm_add_ps(t0,t1);
_mm_store_ps(data->c+count*4,t0);
}
--data->repeat;
}
time_t end = time(0);
data->sec = end - start;
return NULL;
}
void add(float *a,float *b,float *c,int len)
{
myData data[num_t] = {{0}};
pthread_t pid[num_t];
int i;
for(i = 0; i < num_t; i++)
{
data[i].len = len/num_t;
if(i == num_t -1)
{
data[i].len += len % num_t;
}
data[i].a = a + len/num_t * i;
data[i].b = b + len/num_t * i;
data[i].c = c + len/num_t * i;
data[i].repeat = 4096;
if(pthread_create(pid+i,NULL,(void *)myThread, data + i))
{
printf("Create thread failed \n");
exit(-1);
}
}
for(i=0;i<num_t;i++){
pthread_join(pid[i],NULL);
}
time_t total = 0;
for(i=0;i<num_t;i++){
total += data[i].sec;
}
printf("%d sec\n", (int)total);
}
/*
 * Entry point: calls add() once on three static arrays of 1024*1024
 * floats each. Static arrays start out zero-filled and do not occupy
 * stack space.
 */
int main()
{
    enum { n_elems = 1024 * 1024 };

    static float a[n_elems];
    static float b[n_elems];
    static float c[n_elems];

    add(a, b, c, n_elems);

    return 0;
}
gcc -Dnum_t=4 main.c -lpthread
./a.out
gcc -Dnum_t=1 main.c -lpthread
./a.out