Assembly code level optimization Technique
Optimizing at the code level is not effective as higher-level optimizations. Memory allocation optimizations, algorithmic optimizations should be tried before trying out the code level optimization. Code line level optimization is performed based on the assembly instruction created for one line of source code or one function. This needs to be done with the help of assembler and disassembler.
The output of the compiler (object file and executable) should conform to standard format so that it shall be run by the operating system. This format is called ELF (Executable and linking format) and all compilers output including FORTRAN, C, C++, etc will conform to this format. It defines how an executable file is organized so that loader knows where to look for code, data, etc.
Analyzing the assembly instructions is tricky and it requires knowledge about the assembly language. To get primitive understanding of how the C source code gets transformed to assembly language, you can use objdump tool
For example, consider the following program (fact.c)
#include <stdio.h>
int factorial(int number)
{
if (number >12) return(1);
int fact =1;
int i =1;
for (; i<=number; i++)
{
fact = fact * i;
}
return fact;
}
int main()
{
printf(“%d\n”, factorial(12));
return 0;
}
Compile the above code with debugging enabled(-g option)
$gcc -g -c fact.c
Objdump -S option displays both the source code and its corresponding assembly instructions together. This will help us to understand how C code is transformed to assembly by the compiler.
$objdump -S fact.o
fact.o: file format elf32-i386
Disassembly of section .text:
00000000 <factorial>:
#include <stdio.h>
int factorial(int number)
{
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 83 ec 14 sub $0×14,%esp
if (number >12) return(1);
6: 83 7d 08 0c cmpl $0xc,0×8(%ebp)
a: 7e 09 jle 15 <factorial+0×15>
c: c7 45 ec 01 00 00 00 movl $0×1,0xffffffec(%ebp)
13: eb 2c jmp 41 <factorial+0×41>
int fact =1;
15: c7 45 f8 01 00 00 00 movl $0×1,0xfffffff8(%ebp)
int i =1;
1c: c7 45 fc 01 00 00 00 movl $0×1,0xfffffffc(%ebp)
for (; i<=number; i++)
23: eb 0e jmp 33 <factorial+0×33>
{
fact = fact * i;
25: 8b 45 f8 mov 0xfffffff8(%ebp),%eax
28: 0f af 45 fc imul 0xfffffffc(%ebp),%eax
2c: 89 45 f8 mov %eax,0xfffffff8(%ebp)
2f: 83 45 fc 01 addl $0×1,0xfffffffc(%ebp)
33: 8b 45 fc mov 0xfffffffc(%ebp),%eax
36: 3b 45 08 cmp 0×8(%ebp),%eax
39: 7e ea jle 25 <factorial+0×25>
}
return fact;
3b: 8b 45 f8 mov 0xfffffff8(%ebp),%eax
3e: 89 45 ec mov %eax,0xffffffec(%ebp)
41: 8b 45 ec mov 0xffffffec(%ebp),%eax
}
44: c9 leave
45: c3 ret
00000046 <main>:
int main()
{
46: 8d 4c 24 04 lea 0×4(%esp),%ecx
4a: 83 e4 f0 and $0xfffffff0,%esp
4d: ff 71 fc pushl 0xfffffffc(%ecx)
50: 55 push %ebp
51: 89 e5 mov %esp,%ebp
53: 51 push %ecx
54: 83 ec 14 sub $0×14,%esp
printf(“%d\n”, factorial(12));
57: c7 04 24 0c 00 00 00 movl $0xc,(%esp)
5e: e8 fc ff ff ff call 5f <main+0×19>
63: 89 44 24 04 mov %eax,0×4(%esp)
67: c7 04 24 00 00 00 00 movl $0×0,(%esp)
6e: e8 fc ff ff ff call 6f <main+0×29>
return 0;
73: b8 00 00 00 00 mov $0×0,%eax
}
78: 83 c4 14 add $0×14,%esp
7b: 59 pop %ecx
7c: 5d pop %ebp
7d: 8d 61 fc lea 0xfffffffc(%ecx),%esp
80: c3 ret
Even if you do not know assembly instructions, you can still use this technique to improve the code by counting the total number of assembly instructions produced for a function before and after your code changes.
Compiler(gcc) also provides -S option to stop processing after it produces an assembly file.
Consider the following program t1.c
#include<stdlib.h>
typedef struct element
{
int val;
struct element *next;
} Element;
Element *head;
void insert(int no)
{
Element *newNode = (Element *) malloc( sizeof( Element ) );
newNode->val = no;
newNode->next = NULL;
if (NULL == head) { head = newNode; return ;}
Element *iter = head;
while (NULL != iter->next) iter = iter->next;
iter->next = newNode;
return;
}
int main()
{
insert(10);
return 0;
}
Generate the assembly file for the above program as follows
$gcc -O2 -S t1.c
gcc will produce an assembly output file whose file name is the same as the original c file (t1 in this example) with a .s suffix. It will display a long output, so it is better to redirect it to a file.
The output file contains the following
.file “t1.c”
.text
.p2align 4,,15
.globl insert
.type insert, @function
insert:
pushl %ebp
movl %esp, %ebp
subl $8, %esp
movl $8, (%esp)
call malloc
movl head, %edx
testl %edx, %edx
movl %eax, %ecx
movl 8(%ebp), %eax
movl $0, 4(%ecx)
movl %eax, (%ecx)
jne .L8
jmp .L11
.p2align 4,,7
.L5:
movl %eax, %edx
.L8:
movl 4(%edx), %eax
testl %eax, %eax
jne .L5
movl %ecx, 4(%edx)
leave
.p2align 4,,2
ret
.L11:
movl %ecx, head
leave
ret
.size insert, .-insert
.p2align 4,,15
.globl main
.type main, @function
main:
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
pushl %ecx
subl $4, %esp
….
The insert function has approximately 25 instructions with three branches.
If the program implements simple linked list, then we shall modify the program as follows (t2.c)
#include<stdio.h>
#include<stdlib.h>
typedef struct element
{
int val;
struct element *next;
} Element;
Element *head;
void insert(int no)
{
Element *newNode = (Element *) malloc( sizeof( Element ) );
newNode->val = no;
newNode->next = head;
head = newNode;
return;
}
int main()
{
insert(10);
return 0;
}
Generate the assembly instruction
$gcc -O2 -S t2.c
t2.s contains
.file “tt.c”
.text
.p2align 4,,15
.globl insert
.type insert, @function
insert:
pushl %ebp
movl %esp, %ebp
subl $8, %esp
movl $8, (%esp)
call malloc
movl 8(%ebp), %edx
movl %edx, (%eax)
movl head, %edx
movl %edx, 4(%eax)
movl %eax, head
leave
ret
.size insert, .-insert
.p2align 4,,15
.globl main
.type main, @function
main:
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
…
Now insert() function contains only 12 instructions, we have optimized the code to run faster without changing the behavior. This code changes cannot be made if main() assumes that the linked list retrieval is in FIFO fashion.
No comments yet.