Assembly code level optimization Technique

Optimizing at the code level is not effective as higher-level optimizations. Memory allocation optimizations, algorithmic optimizations should be tried before trying out the code level optimization. Code line level optimization is performed based on the assembly instruction created for one line of source code or one function. This needs to be done with the help of assembler and disassembler.

The output of the compiler (object file and executable) should conform to standard format so that it shall be run by the operating system. This format is called ELF (Executable and linking format) and all compilers output including FORTRAN, C, C++, etc will conform to this format. It defines how an executable file is organized so that loader knows where to look for code, data, etc.

Analyzing the assembly instructions is tricky and it requires knowledge about the assembly language. To get primitive understanding of how the C source code gets transformed to assembly language, you can use objdump tool

For example, consider the following program (fact.c)

#include <stdio.h>
int factorial(int number)
{
    if (number >12) return(1);
    int fact =1;
    int i =1;
    for (; i<=number; i++)
    {
        fact = fact * i;
    }
    return fact;
}
int main()
{
   printf(“%d\n”, factorial(12));
   return 0;
}

Compile the above code with debugging enabled(-g option)

$gcc -g -c fact.c
Objdump -S option displays both the source code and its corresponding assembly instructions together. This will help us to understand how C code is transformed to assembly by the compiler.

$objdump -S fact.o
fact.o:     file format elf32-i386
Disassembly of section .text:
00000000 <factorial>:
#include <stdio.h>
int factorial(int number)
{
   0:   55                      push   %ebp
   1:   89 e5                   mov    %esp,%ebp
   3:   83 ec 14                sub    $0x14,%esp
    if (number >12) return(1);
   6:   83 7d 08 0c             cmpl   $0xc,0x8(%ebp)
   a:   7e 09                   jle    15 <factorial+0x15>
   c:   c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
  13:   eb 2c                   jmp    41 <factorial+0x41>
    int fact =1;
  15:   c7 45 f8 01 00 00 00    movl   $0x1,0xfffffff8(%ebp)
    int i =1;
  1c:   c7 45 fc 01 00 00 00    movl   $0x1,0xfffffffc(%ebp)
    for (; i<=number; i++)
  23:   eb 0e                   jmp    33 <factorial+0x33>
    {
        fact = fact * i;
  25:   8b 45 f8                mov    0xfffffff8(%ebp),%eax
  28:   0f af 45 fc             imul   0xfffffffc(%ebp),%eax
  2c:   89 45 f8                mov    %eax,0xfffffff8(%ebp)
  2f:   83 45 fc 01             addl   $0x1,0xfffffffc(%ebp)
  33:   8b 45 fc                mov    0xfffffffc(%ebp),%eax
  36:   3b 45 08                cmp    0x8(%ebp),%eax
  39:   7e ea                   jle    25 <factorial+0x25>
    }
    return fact;
  3b:   8b 45 f8                mov    0xfffffff8(%ebp),%eax
  3e:   89 45 ec                mov    %eax,0xffffffec(%ebp)
  41:   8b 45 ec                mov    0xffffffec(%ebp),%eax
}
  44:   c9                      leave
  45:   c3                      ret

00000046 <main>:
int main()
{
  46:   8d 4c 24 04             lea    0x4(%esp),%ecx
  4a:   83 e4 f0                and    $0xfffffff0,%esp
  4d:   ff 71 fc                pushl  0xfffffffc(%ecx)
  50:   55                      push   %ebp
  51:   89 e5                   mov    %esp,%ebp
  53:   51                      push   %ecx
  54:   83 ec 14                sub    $0x14,%esp
   printf(“%d\n”, factorial(12));
  57:   c7 04 24 0c 00 00 00    movl   $0xc,(%esp)
  5e:   e8 fc ff ff ff          call   5f <main+0x19>
  63:   89 44 24 04             mov    %eax,0x4(%esp)
  67:   c7 04 24 00 00 00 00    movl   $0x0,(%esp)
  6e:   e8 fc ff ff ff          call   6f <main+0x29>
   return 0;
  73:   b8 00 00 00 00          mov    $0x0,%eax
}
  78:   83 c4 14                add    $0x14,%esp
  7b:   59                      pop    %ecx
  7c:   5d                      pop    %ebp
  7d:   8d 61 fc                lea    0xfffffffc(%ecx),%esp
  80:   c3                      ret

Even if you do not know assembly instructions, you can still use this technique to improve the code by counting the total number of assembly instructions produced for a function before and after your code changes.

Compiler(gcc) also provides -S option to stop processing after it produces an assembly file.

Consider the following program t1.c

#include<stdlib.h>
typedef struct element
{
    int              val;
    struct element  *next;
} Element;
Element *head;

void insert(int no)
{
    Element *newNode = (Element *) malloc( sizeof( Element ) );
    newNode->val = no;
    newNode->next = NULL;
    if (NULL == head) { head = newNode; return ;}
    Element *iter = head;
    while (NULL != iter->next) iter = iter->next;
    iter->next = newNode;
    return;
}
int main()
{
    insert(10);
    return 0;
}
Generate the assembly file for the above program as follows

$gcc -O2 -S t1.c
gcc will produce an assembly output file whose file name is the same as the original c file (t1 in this example) with a .s suffix. It will display a long output, so it is better to redirect it to a file.

The output file contains the following

        .file   “t1.c”
        .text
        .p2align 4,,15
.globl insert
        .type   insert, @function
insert:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        movl    $8, (%esp)
        call    malloc
        movl    head, %edx
        testl   %edx, %edx
        movl    %eax, %ecx
        movl    8(%ebp), %eax
        movl    $0, 4(%ecx)
        movl    %eax, (%ecx)
        jne     .L8
        jmp     .L11
        .p2align 4,,7
.L5:
        movl    %eax, %edx
.L8:
        movl    4(%edx), %eax
        testl   %eax, %eax
        jne     .L5
        movl    %ecx, 4(%edx)
        leave
        .p2align 4,,2
        ret
.L11:
        movl    %ecx, head
        leave
        ret
        .size   insert, .-insert
        .p2align 4,,15
.globl main
        .type   main, @function
main:
        leal    4(%esp), %ecx
        andl    $-16, %esp
        pushl   -4(%ecx)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ecx
        subl    $4, %esp
….
The insert function has approximately 25 instructions with three branches.

If the program implements simple linked list, then we shall modify the program as follows (t2.c)

#include<stdio.h>
#include<stdlib.h>
typedef struct element
{
    int              val;
    struct element  *next;
} Element;
Element *head;
void insert(int no)
{
    Element *newNode = (Element *) malloc( sizeof( Element ) );
    newNode->val = no;
    newNode->next = head;
    head = newNode;
    return;
}
int main()
{
    insert(10);
    return 0;
}
Generate the assembly instruction

$gcc -O2 -S t2.c
t2.s contains

        .file   “tt.c”
        .text
        .p2align 4,,15
.globl insert
        .type   insert, @function
insert:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        movl    $8, (%esp)
        call    malloc
        movl    8(%ebp), %edx
        movl    %edx, (%eax)
        movl    head, %edx
        movl    %edx, 4(%eax)
        movl    %eax, head
        leave
        ret
        .size   insert, .-insert
        .p2align 4,,15
.globl main
        .type   main, @function
main:
        leal    4(%esp), %ecx
        andl    $-16, %esp
        pushl   -4(%ecx)
        pushl   %ebp
        movl    %esp, %ebp

Now insert() function contains only 12 instructions, we have optimized the code to run faster without changing the behavior. This code changes cannot be made if main() assumes that the linked list retrieval is in FIFO fashion.

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s