C代码优化
如下函数的功能是拷贝并转置一个N×N的矩阵(N即参数dim):
/*
 * Copy the dim x dim matrix at src into dst, transposed.
 *
 * Both buffers are indexed as flat arrays of dim*dim ints: element
 * (i, j) of src is read from src[i*dim + j] and written to
 * dst[j*dim + i].
 *
 * dst - destination buffer, written at every index 0 .. dim*dim-1
 * src - source buffer, read at every index 0 .. dim*dim-1
 * dim - number of rows and columns; when dim <= 0 neither loop body
 *       runs and nothing is read or written
 *
 * Note: if dst and src are the same buffer the result is not a
 * transpose, because elements are overwritten before they are read.
 */
void transpose(int *dst, int *src, int dim)
{
int i,j;
/* i walks the rows of src, j walks the columns within row i. */
for(i = 0;i < dim;i++)
for(j = 0; j < dim;j++)
/* Reads from src are consecutive in j; writes to dst are dim elements apart. */
dst[j*dim + i] = src[i*dim +j];
}
我用gcc不带-O2参数生成的汇编代码如下:
# transpose(dst, src, dim) -- 32-bit x86, AT&T syntax.
# Arguments are read relative to %ebp:
#   8(%ebp) = dst, 12(%ebp) = src, 16(%ebp) = dim
# Loop counters are kept in memory, not in registers:
#   -8(%ebp) = i (outer loop), -4(%ebp) = j (inner loop)
# Every use of i, j or dim below is therefore a memory access.
transpose:
pushl %ebp  # save the caller's frame pointer
movl %esp, %ebp  # set up this function's frame pointer
subl $16, %esp  # reserve 16 bytes of locals; only -4 and -8 are used below
movl $0, -8(%ebp)  # i = 0
jmp .L2  # enter the outer loop at its condition check
.L3:
# Outer loop body: start a new pass of the inner loop.
movl $0, -4(%ebp)  # j = 0
jmp .L4  # enter the inner loop at its condition check
.L5:
# Inner loop body: dst[j*dim + i] = src[i*dim + j]
# First compute the destination address into %edx.
movl -4(%ebp), %eax  # eax = j
imull 16(%ebp), %eax  # eax = j * dim
addl -8(%ebp), %eax  # eax = j*dim + i
sall $2, %eax  # scale the index by 4 (bytes per element)
movl %eax, %edx
addl 8(%ebp), %edx  # edx = address of dst[j*dim + i]
# Then compute the source address into %eax.
movl -8(%ebp), %eax  # eax = i (reloaded from the stack)
imull 16(%ebp), %eax  # eax = i * dim
addl -4(%ebp), %eax  # eax = i*dim + j
sall $2, %eax  # scale the index by 4 (bytes per element)
addl 12(%ebp), %eax  # eax = address of src[i*dim + j]
movl (%eax), %eax  # load the source element
movl %eax, (%edx)  # store it to the destination
addl $1, -4(%ebp)  # j++ performed directly on the stack slot
.L4:
# Inner loop condition: continue while j < dim (signed compare).
movl -4(%ebp), %eax
cmpl 16(%ebp), %eax
jl .L5
addl $1, -8(%ebp)  # i++ performed directly on the stack slot
.L2:
# Outer loop condition: continue while i < dim (signed compare).
movl -8(%ebp), %eax
cmpl 16(%ebp), %eax
jl .L3
leave  # restore %esp from %ebp and pop the saved %ebp
ret
从上可知,变量i、j并没有存放在寄存器中,而是存储在堆栈中,从而增加了加载和存储的开销。
请问有没有办法通过优化C代码来改变这种状况?
还望各位高人指教,小弟在这里谢了!