Vector Addition Examples on the MIC Platform
Vector addition examples on the MIC platform, covering an OpenMP version, a MIC offload version, a MIC native version, a CPU+MIC offload version, and a CPU+MIC symmetric (peer-to-peer) version, plus the MIC network configuration procedure. Hopefully this is of some help to readers who want to learn MIC programming.
1. Serial vector addition program
vectoradd_cpu.cpp
#include <stdio.h>
#include <stdlib.h>

#define N 200000

// Serial vector addition: C = A + B
void VecAdd_cpu(float* A, float* B, float* C, int size)
{
    for (int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i;
    int size = N * sizeof(float);

    float *A, *B, *C;
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    // Fill the inputs with random digits 0-9
    srand(2013);
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    VecAdd_cpu(A, B, C, N);

    // Print every 10000th element as a quick check
    for (i = 0; i < N; i += 10000)
    {
        printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
    }

    free(A);
    free(B);
    free(C);
}
Compile: icpc -O3 -o vectoradd_cpu vectoradd_cpu.cpp
Run: ./vectoradd_cpu
Output:
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
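It can be useful to time this serial run as a baseline before moving on to the parallel versions. A minimal sketch using the shell's time builtin (the numbers depend entirely on your machine):

time ./vectoradd_cpu    # wall-clock baseline to compare against the OpenMP/MIC versions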
2. OpenMP multithreaded vector addition program
vectoradd_omp.cpp
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define N 200000

// OpenMP vector addition: loop iterations are divided among the threads
void VecAdd_omp(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for (int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i;
    int size = N * sizeof(float);

    float *A, *B, *C;
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    srand(2013);
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    VecAdd_omp(A, B, C, N);

    for (i = 0; i < N; i += 10000)
    {
        printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
    }

    free(A);
    free(B);
    free(C);
}
Compile: icpc -O3 -openmp -o vectoradd_omp vectoradd_omp.cpp
Run: ./vectoradd_omp
Output:
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
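The OpenMP thread count is not fixed in the source, so it can be set from the environment before running. A small sketch with the standard OpenMP environment variable (16 is just a placeholder value; pick one that matches your CPU):

export OMP_NUM_THREADS=16    # hypothetical host thread count
./vectoradd_omp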
3. MIC offload multithreaded vector addition program
vectoradd_mic_offload.cpp
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define N 200000

// Compiled for both host and MIC; reports where it actually executes
__attribute__((target(mic))) void offload_check(void)
{
#ifdef __MIC__
    printf("Code running on MIC\n");
#else
    printf("Code running on host\n");
#endif
}

__attribute__((target(mic)))
void VecAdd_mic(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for (int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i;
    int size = N * sizeof(float);

    float *A, *B, *C;
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    srand(2013);
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    // Offload the computation: A and B are copied in, C is copied back out
    #pragma offload target(mic) in(A,B: length(N)) out(C: length(N))
    {
        offload_check();
        VecAdd_mic(A, B, C, N);
    }

    for (i = 0; i < N; i += 10000)
    {
        printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
    }

    free(A);
    free(B);
    free(C);
}
Compile: icpc -O3 -openmp -o vectoradd_mic_offload vectoradd_mic_offload.cpp
Run: ./vectoradd_mic_offload
Output:
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
Code running on MIC
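For the offload version, the OpenMP settings used inside the offload region belong to the coprocessor side. Below is a sketch of one common way to pass them through with Intel's offload runtime; the variable names and the thread count 240 are assumptions to adapt to your compiler/MPSS version:

export OFFLOAD_REPORT=2          # ask the offload runtime to print data-transfer/timing reports
export MIC_ENV_PREFIX=MIC        # forward MIC_-prefixed variables to the coprocessor
export MIC_OMP_NUM_THREADS=240   # hypothetical OpenMP thread count on the card
./vectoradd_mic_offload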
4. MIC native multithreaded vector addition program
vectoradd_mic_native.cpp
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define N 200000

// Same OpenMP code as the host version; compiled with -mmic it runs natively on the card
void VecAdd_omp(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for (int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i;
    int size = N * sizeof(float);

    float *A, *B, *C;
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    srand(2013);
    for (i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    VecAdd_omp(A, B, C, N);

    for (i = 0; i < N; i += 10000)
    {
        printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
    }

    free(A);
    free(B);
    free(C);
}
Compile: icpc -O3 -openmp -mmic -o vectoradd_mic_native vectoradd_mic_native.cpp
Run:
scp /opt/intel/composer_xe_2013.0.079/compiler/lib/mic/libiomp5.so mic0:/tmp/
scp vectoradd_mic_native mic0:/tmp/
ssh mic0    (log in to the MIC card)
cd /tmp
export LD_LIBRARY_PATH=/tmp/    (set the library path; if libiomp5.so was copied to /lib64 earlier, this step can be skipped)
./vectoradd_mic_native
Output:
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
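Because the native binary runs entirely on the card, OpenMP environment variables are set inside the ssh session on mic0, just like LD_LIBRARY_PATH above. A sketch (240 threads and scatter affinity are only illustrative values):

export OMP_NUM_THREADS=240     # hypothetical: e.g. 4 threads per core on a ~60-core card
export KMP_AFFINITY=scatter    # spread the threads across the cores
./vectoradd_mic_native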
5. CPU+MIC offload parallel program
vectoradd_cpu_mic_offload.cpp
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N 200000

// Reports whether a rank's computation ran on the host or on the MIC
__attribute__((target(mic))) void offload_check(int rankID)
{
#ifdef __MIC__
    printf("RankID %d running on MIC\n", rankID);
#else
    printf("RankID %d running on host\n", rankID);
#endif
}

__attribute__((target(mic)))
void VecAdd_omp(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for (int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i, M;
    int myrank, root = 0, totalrank;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &totalrank);

    if (myrank == root)
        printf("total rank is:%d\n", totalrank);
    M = N / (totalrank - 1);    // elements per worker rank (rank 0 does no computation)

    if (myrank == root)
    {
        // Master rank: generate the data, scatter it, gather the results
        float *A, *B, *C;
        int size = N * sizeof(float);
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);

        srand(2013);
        for (i = 0; i < N; i++)
        {
            A[i] = rand() % 10;
            B[i] = rand() % 10;
        }

        for (i = 1; i < totalrank; i++)
        {
            MPI_Send(A + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD);
            MPI_Send(B + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD);
        }

        for (i = 1; i < totalrank; i++)
        {
            MPI_Recv(C + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD, &status);
        }
        for (i = 0; i < N; i += 10000)
        {
            printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
        }
        free(A);
        free(B);
        free(C);
    }
    else
    {
        // Worker ranks: receive a chunk, compute it, send the result back
        float *A, *B, *C;
        int size = M * sizeof(float);
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);

        MPI_Recv(A, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD, &status);
        MPI_Recv(B, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD, &status);
        if (myrank == 1)        // CPU
        {
            offload_check(myrank);
            VecAdd_omp(A, B, C, M);
        }
        else if (myrank == 2)   // MIC
        {
            #pragma offload target(mic) in(A,B: length(M)) out(C: length(M))
            {
                offload_check(myrank);
                VecAdd_omp(A, B, C, M);
            }
        }
        MPI_Send(C, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD);
        free(A);
        free(B);
        free(C);
    }
    MPI_Finalize();
}
Compile: mpiicpc -O3 -openmp -o vectoradd_cpu_mic_offload vectoradd_cpu_mic_offload.cpp
Run: mpirun -np 3 ./vectoradd_cpu_mic_offload
// Rank 0 is the master process, rank 1 is the CPU compute process, and rank 2 is the MIC compute process
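With 3 ranks, each worker handles M = N/(totalrank-1) = 200000/2 = 100000 elements: rank 1 computes elements 0-99999 on the CPU, and rank 2 computes elements 100000-199999 by offloading to the MIC.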
Output:
total rank is:3
RankID 1 running on host
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
RankID 2 running on MIC
6. CPU+MIC symmetric parallel program
vectoradd_cpu_mic_symmetric.cpp
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N 200000

// In symmetric mode the same source is built twice (host and -mmic);
// __MIC__ is defined only in the MIC build
void offload_check(int rankID)
{
#ifdef __MIC__
    printf("RankID %d running on MIC\n", rankID);
#else
    printf("RankID %d running on host\n", rankID);
#endif
}

void VecAdd_omp(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for (int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i, M;
    int myrank, root = 0, totalrank;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &totalrank);

    if (myrank == root)
        printf("total rank is:%d\n", totalrank);
    M = N / (totalrank - 1);    // elements per worker rank

    if (myrank == root)
    {
        // Master rank: generate the data, scatter it, gather the results
        float *A, *B, *C;
        int size = N * sizeof(float);
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);

        srand(2013);
        for (i = 0; i < N; i++)
        {
            A[i] = rand() % 10;
            B[i] = rand() % 10;
        }

        for (i = 1; i < totalrank; i++)
        {
            MPI_Send(A + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD);
            MPI_Send(B + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD);
        }

        for (i = 1; i < totalrank; i++)
        {
            MPI_Recv(C + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD, &status);
        }
        for (i = 0; i < N; i += 10000)
        {
            printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
        }
        free(A);
        free(B);
        free(C);
    }
    else
    {
        // Worker ranks: each computes its chunk locally, whether it happens
        // to be running on the host or on a MIC card
        float *A, *B, *C;
        int size = M * sizeof(float);
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);

        MPI_Recv(A, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD, &status);
        MPI_Recv(B, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD, &status);

        offload_check(myrank);
        VecAdd_omp(A, B, C, M);

        MPI_Send(C, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD);

        free(A);
        free(B);
        free(C);
    }
    MPI_Finalize();
}
Compile:
mpiicpc -O3 -openmp -o vectoradd_cpu_mic_symmetric vectoradd_cpu_mic_symmetric.cpp
mpiicpc -O3 -openmp -mmic -o vectoradd_cpu_mic_symmetric.out vectoradd_cpu_mic_symmetric.cpp
Run:
scp vectoradd_cpu_mic_symmetric.out mic0:/tmp
scp /opt/intel/impi/4.1.0.024/mic/lib/*.so* mic0:/tmp
scp /opt/intel/impi/4.1.0.024/mic/bin/* mic0:/bin/
export MIC_LD_LIBRARY_PATH=/tmp
export I_MPI_MIC=enable
mpiexec.hydra -host 192.168.1.100 -n 2 ./vectoradd_cpu_mic_symmetric : -host 192.168.1.101 -n 1 -wdir /tmp /tmp/vectoradd_cpu_mic_symmetric.out
// 192.168.1.100 is the host IP and 192.168.1.101 is the MIC0 IP; see the appendix for the MIC network configuration.
// Rank 0 is the master process, rank 1 is the CPU compute process, and rank 2 is the MIC compute process
Output:
total rank is:3
RankID 1 running on host
RankID 2 running on MIC
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
7. Appendix
MIC LAN IP configuration
First make sure the bridge-utils package is installed:
# rpm -qa | grep bridge-utils
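If it is missing, it can be installed from the distribution repositories; a hedged example for the RHEL/CentOS-style system that the service/chkconfig commands below assume:

# yum install bridge-utils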
Then stop the NetworkManager service:
/etc/init.d/NetworkManager stop
chkconfig --level 345 NetworkManager off
Edit the configuration files (be sure to keep a copy of the MAC address from the eth1 config!):
cd /etc/sysconfig/network-scripts/
vim ifcfg-eth1
Change it to:
DEVICE="eth1"
NM_CONTROLLED="yes"
ONBOOT=yes
TYPE=Ethernet
BRIDGE=br0
HWADDR=6C:92:BF:00:43:CB
NAME="System eth1"
UUID=9c92fad9-6ecb-3e6c-eb4d-8a47c6f50c04
The last two lines (NAME and UUID) do not matter; keep them if they are there, or leave them out.
Create ifcfg-br0 and edit it:
vim ifcfg-br0
DEVICE=br0
TYPE=Bridge
ONBOOT=yes
DELAY=0
NM_CONTROLLED="no"
MTU=9000
NOZEROCONF=yes
BOOTPROTO=static
IPADDR=192.168.1.100
NETMASK=255.255.255.0
Assign IPADDR by hand and make sure it is not already in use (leave addresses free for the MIC cards).
Restart the network service:
service network restart
After the restart, if ifconfig shows br0 with the address you set and eth1 no longer has an address, the bridge is configured correctly.
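To double-check, something like the following should show eth1 attached to br0 and the configured address on br0 (brctl comes from the bridge-utils package checked earlier):

brctl show      # br0 should list eth1 as an attached interface
ifconfig br0    # should show the IPADDR set above, e.g. 192.168.1.100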
Set the IP addresses of the MIC cards:
First stop the mpss service:
service mpss stop
If micctrl --initdefaults has never been run, run it now. (Do not run it repeatedly! Running it once when the card is first installed is enough.)
Edit the configuration file:
cd /etc/sysconfig/mic
vim default.conf
Remove the '#' from the #BridgeName micbr0 line and set it to the bridge name configured above, e.g.:
BridgeName br0
Change the Subnet 172.31 field to the IP address intended for the first MIC card; that is, if the host is 192.168.1.100, set it to 192.168.1.101:
Subnet 192.168.1.101
Add two more lines:
MTUsize 9000
NetBits 24
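After these edits, the relevant part of default.conf should look roughly like this (values taken from the example above):

BridgeName br0
Subnet 192.168.1.101
MTUsize 9000
NetBits 24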
Set each MIC card's IP address:
vim mic0.conf and add:
MicIPaddress 192.168.1.101
vim mic1.conf and add:
MicIPaddress 192.168.1.102
Regenerate the MIC configuration:
micctrl --resetconfig
Restart the MIC service:
service mpss restart
If two nodes are configured this way, they can communicate with each other. For example, with the current setup: on node01 the host is 192.168.1.100 and its two cards are .101 and .102; on node02 the host is 192.168.1.103 and its two cards are .104 and .105. At this point every card and every host can ping each other and log in to each other over ssh, but logging in across nodes still asks for a password.
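A quick sketch of how to confirm this with the example addresses above, run from node01's host:

ping -c 3 192.168.1.101    # node01's first MIC card
ping -c 3 192.168.1.104    # node02's first MIC card
ssh 192.168.1.103          # node02's host; at this point it still asks for a password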
Passwordless login:
Generate an ssh key pair with ssh-keygen (the book covers this, so it is not repeated in detail here; a sketch of the whole procedure is shown below).
scp the local machine's ~/.ssh/id_rsa.pub (or the dsa equivalent) to the remote host and rename it, e.g. to id_rsa.node02.pub, then append it to the authorized keys:
cat id_rsa.node02.pub >> ~/.ssh/authorized_keys
The local machine can now log in to the remote host without a password.
Then stop the mpss service, run micctrl --resetconfig, and restart mpss; after that the remote MIC cards can also be reached without a password.
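A minimal sketch of these passwordless-login steps, assuming the local machine is node02 and the remote host is node01 (adjust names to your cluster):

ssh-keygen -t rsa                                   # generate the key pair; the defaults are fine
scp ~/.ssh/id_rsa.pub node01:~/id_rsa.node02.pub    # copy the public key, renamed after its source
ssh node01 "cat ~/id_rsa.node02.pub >> ~/.ssh/authorized_keys"
# on node01, propagate the updated keys to its MIC cards:
ssh node01 "service mpss stop && micctrl --resetconfig && service mpss restart"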
Recovering from a wrong bridge configuration:
First delete or rename ifcfg-br0.
Edit /etc/udev/rules.d/70-persistent-net.rules and comment out the entries that are no longer needed or that have duplicate MAC addresses.
Edit ifcfg-eth1:
Comment out (with '#') the BRIDGE=br0 line.
Add BOOTPROTO=dhcp (or assign an IP address manually).
Reboot the node (service network restart does not seem to be enough here).
After the reboot, eth1 should have an address obtained directly from the router.
Then restore ifcfg-br0,
edit ifcfg-eth1 again to restore the BRIDGE line and comment out BOOTPROTO,
and reboot the node once more.
Sample ifcfg-br0 and ifcfg-eth1 for reference:
[root@node01 network-scripts]# cat ifcfg-br0
DEVICE=br0
TYPE=Bridge
ONBOOT=yes
DELAY=0
NM_CONTROLLED="no"
MTU=9000
NOZEROCONF=yes
BOOTPROTO=static
IPADDR=192.168.1.100
NETMASK=255.255.255.0
[root@node01 network-scripts]# cat ifcfg-eth1
DEVICE="eth1"
NM_CONTROLLED="yes"
ONBOOT=yes
DELAY=5
HWADDR=6C:92:BF:00:15:C5
TYPE=Ethernet
BRIDGE=br0
#BOOTPROTO=dhcp
#DEFROUTE=yes
#PEERDNS=yes
#PEERROUTES=yes
#IPV4_FAILURE_FATAL=yes
#IPV6INIT=no
MTU=9000
NAME="System eth1"
UUID=9c92fad9-6ecb-3e6c-eb4d-8a47c6f50c04
Source code available at:
http://hpcbbs.it168.com/forum.php?mod=attachment&aid=MzIyMnwxNDUxZTBkY3wxMzYxNDM1NjAyfDIwNzYxMDIwfDczMzc%3D