编译器真的好聪明好聪明耶
看来我不用自己学写汇编来优化SIMD操作了?
简单的向量相加（下面是 C 源码，以及编译后用 objdump 得到的反汇编）：
/* Four-component single-precision vector. */
typedef struct {
    float x;
    float y;
    float z;
    float w;
} vec4f;

/* Four-component double-precision vector. */
typedef struct {
    double x;
    double y;
    double z;
    double w;
} vec4d;

/*
 * Component-wise in-place addition: *a += *b.
 *
 * a: destination and left operand; updated in place.
 * b: right operand; only read, hence const.
 *
 * Both pointers are restrict-qualified, so the caller must not pass
 * overlapping objects.
 */
void vec4f_add(vec4f *restrict a, const vec4f *restrict b)
{
    a->x += b->x;
    a->y += b->y;
    a->z += b->z;
    a->w += b->w;
}

/*
 * Component-wise in-place addition: *a += *b (double-precision variant).
 *
 * a: destination and left operand; updated in place.
 * b: right operand; only read, hence const.
 *
 * Both pointers are restrict-qualified, so the caller must not pass
 * overlapping objects.
 */
void vec4d_add(vec4d *restrict a, const vec4d *restrict b)
{
    a->x += b->x;
    a->y += b->y;
    a->z += b->z;
    a->w += b->w;
}

/*
 * Disassembly of the compiled object (objdump output; the compiler and
 * flags used are not recorded here). In the listing, the four float adds
 * appear as a single vaddps on xmm registers, and the four double adds as
 * a single vaddpd on ymm registers assembled from two unaligned 128-bit
 * halves.
 *
 * vec.o:     file format elf64-x86-64
 *
 * Disassembly of section .text:
 *
 * 0000000000000000 <vec4f_add>:
 *    0: c5 f8 10 0f             vmovups (%rdi),%xmm1
 *    4: c5 f8 10 06             vmovups (%rsi),%xmm0
 *    8: c5 f0 58 c0             vaddps %xmm0,%xmm1,%xmm0
 *    c: c5 f8 11 07             vmovups %xmm0,(%rdi)
 *   10: c3                      retq
 *   11: 66 66 66 66 66 66 2e    data32 data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
 *   18: 0f 1f 84 00 00 00 00
 *   1f: 00
 *
 * 0000000000000020 <vec4d_add>:
 *   20: c5 f9 10 0f             vmovupd (%rdi),%xmm1
 *   24: c5 f9 10 06             vmovupd (%rsi),%xmm0
 *   28: c4 e3 75 18 4f 10 01    vinsertf128 $0x1,0x10(%rdi),%ymm1,%ymm1
 *   2f: c4 e3 7d 18 46 10 01    vinsertf128 $0x1,0x10(%rsi),%ymm0,%ymm0
 *   36: c5 f5 58 c0             vaddpd %ymm0,%ymm1,%ymm0
 *   3a: c5 f9 11 07             vmovupd %xmm0,(%rdi)
 *   3e: c4 e3 7d 19 47 10 01    vextractf128 $0x1,%ymm0,0x10(%rdi)
 *   45: c5 f8 77                vzeroupper
 *   48: c3                      retq
 */