Assembly code level optimization Technique
Optimizing at the code level is not effective as higher-level optimizations. Memory allocation optimizations, algorithmic optimizations should be tried before trying out the code level optimization. Code line level optimization is performed based on the assembly instruction created for one line of source code or one function. This needs to be done with the help of assembler and disassembler.
The output of the compiler (object file and executable) should conform to standard format so that it shall be run by the operating system. This format is called ELF (Executable and linking format) and all compilers output including FORTRAN, C, C++, etc will conform to this format. It defines how an executable file is organized so that loader knows where to look for code, data, etc.
Analyzing the assembly instructions is tricky and it requires knowledge about the assembly language. To get primitive understanding of how the C source code gets transformed to assembly language, you can use objdump tool
For example, consider the following program (fact.c)
#include <stdio.h>
int factorial(int number)
{
if (number >12) return(1);
int fact =1;
int i =1;
for (; i<=number; i++)
{
fact = fact * i;
}
return fact;
}
int main()
{
printf(“%d\n”, factorial(12));
return 0;
}
Compile the above code with debugging enabled(-g option)
$gcc -g -c fact.c
Objdump -S option displays both the source code and its corresponding assembly instructions together. This will help us to understand how C code is transformed to assembly by the compiler.
$objdump -S fact.o
fact.o: file format elf32-i386
Disassembly of section .text:
00000000 <factorial>:
#include <stdio.h>
int factorial(int number)
{
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 83 ec 14 sub $0×14,%esp
if (number >12) return(1);
6: 83 7d 08 0c cmpl $0xc,0×8(%ebp)
a: 7e 09 jle 15 <factorial+0×15>
c: c7 45 ec 01 00 00 00 movl $0×1,0xffffffec(%ebp)
13: eb 2c jmp 41 <factorial+0×41>
int fact =1;
15: c7 45 f8 01 00 00 00 movl $0×1,0xfffffff8(%ebp)
int i =1;
1c: c7 45 fc 01 00 00 00 movl $0×1,0xfffffffc(%ebp)
for (; i<=number; i++)
23: eb 0e jmp 33 <factorial+0×33>
{
fact = fact * i;
25: 8b 45 f8 mov 0xfffffff8(%ebp),%eax
28: 0f af 45 fc imul 0xfffffffc(%ebp),%eax
2c: 89 45 f8 mov %eax,0xfffffff8(%ebp)
2f: 83 45 fc 01 addl $0×1,0xfffffffc(%ebp)
33: 8b 45 fc mov 0xfffffffc(%ebp),%eax
36: 3b 45 08 cmp 0×8(%ebp),%eax
39: 7e ea jle 25 <factorial+0×25>
}
return fact;
3b: 8b 45 f8 mov 0xfffffff8(%ebp),%eax
3e: 89 45 ec mov %eax,0xffffffec(%ebp)
41: 8b 45 ec mov 0xffffffec(%ebp),%eax
}
44: c9 leave
45: c3 ret
00000046 <main>:
int main()
{
46: 8d 4c 24 04 lea 0×4(%esp),%ecx
4a: 83 e4 f0 and $0xfffffff0,%esp
4d: ff 71 fc pushl 0xfffffffc(%ecx)
50: 55 push %ebp
51: 89 e5 mov %esp,%ebp
53: 51 push %ecx
54: 83 ec 14 sub $0×14,%esp
printf(“%d\n”, factorial(12));
57: c7 04 24 0c 00 00 00 movl $0xc,(%esp)
5e: e8 fc ff ff ff call 5f <main+0×19>
63: 89 44 24 04 mov %eax,0×4(%esp)
67: c7 04 24 00 00 00 00 movl $0×0,(%esp)
6e: e8 fc ff ff ff call 6f <main+0×29>
return 0;
73: b8 00 00 00 00 mov $0×0,%eax
}
78: 83 c4 14 add $0×14,%esp
7b: 59 pop %ecx
7c: 5d pop %ebp
7d: 8d 61 fc lea 0xfffffffc(%ecx),%esp
80: c3 ret
Even if you do not know assembly instructions, you can still use this technique to improve the code by counting the total number of assembly instructions produced for a function before and after your code changes.
Compiler(gcc) also provides -S option to stop processing after it produces an assembly file.
Consider the following program t1.c
#include<stdlib.h>
typedef struct element
{
int val;
struct element *next;
} Element;
Element *head;
void insert(int no)
{
Element *newNode = (Element *) malloc( sizeof( Element ) );
newNode->val = no;
newNode->next = NULL;
if (NULL == head) { head = newNode; return ;}
Element *iter = head;
while (NULL != iter->next) iter = iter->next;
iter->next = newNode;
return;
}
int main()
{
insert(10);
return 0;
}
Generate the assembly file for the above program as follows
$gcc -O2 -S t1.c
gcc will produce an assembly output file whose file name is the same as the original c file (t1 in this example) with a .s suffix. It will display a long output, so it is better to redirect it to a file.
The output file contains the following
.file “t1.c”
.text
.p2align 4,,15
.globl insert
.type insert, @function
insert:
pushl %ebp
movl %esp, %ebp
subl $8, %esp
movl $8, (%esp)
call malloc
movl head, %edx
testl %edx, %edx
movl %eax, %ecx
movl 8(%ebp), %eax
movl $0, 4(%ecx)
movl %eax, (%ecx)
jne .L8
jmp .L11
.p2align 4,,7
.L5:
movl %eax, %edx
.L8:
movl 4(%edx), %eax
testl %eax, %eax
jne .L5
movl %ecx, 4(%edx)
leave
.p2align 4,,2
ret
.L11:
movl %ecx, head
leave
ret
.size insert, .-insert
.p2align 4,,15
.globl main
.type main, @function
main:
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
pushl %ecx
subl $4, %esp
….
The insert function has approximately 25 instructions with three branches.
If the program implements simple linked list, then we shall modify the program as follows (t2.c)
#include<stdio.h>
#include<stdlib.h>
typedef struct element
{
int val;
struct element *next;
} Element;
Element *head;
void insert(int no)
{
Element *newNode = (Element *) malloc( sizeof( Element ) );
newNode->val = no;
newNode->next = head;
head = newNode;
return;
}
int main()
{
insert(10);
return 0;
}
Generate the assembly instruction
$gcc -O2 -S t2.c
t2.s contains
.file “tt.c”
.text
.p2align 4,,15
.globl insert
.type insert, @function
insert:
pushl %ebp
movl %esp, %ebp
subl $8, %esp
movl $8, (%esp)
call malloc
movl 8(%ebp), %edx
movl %edx, (%eax)
movl head, %edx
movl %edx, 4(%eax)
movl %eax, head
leave
ret
.size insert, .-insert
.p2align 4,,15
.globl main
.type main, @function
main:
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
…
Now insert() function contains only 12 instructions, we have optimized the code to run faster without changing the behavior. This code changes cannot be made if main() assumes that the linked list retrieval is in FIFO fashion.
Code Level Optimization Techniques
Some of the techniques mentioned below “arithmetic operators”, “jump table to replace if…elseif”, “faster for loops” makes the code hard to debug, un-portable and un-maintainable. If there is no other way to optimize, then only attempt these techniques.
Use pass by reference for user defined type
Passing parameters by value to functions, result in the complete parameter being copied on to the stack. Pass by value shall be replaced by const references to avoid this data copy. Passing bigger objects as return values also has the same performance issues; instead replace it as OUT parameters for that function.
Pre-compute quantities to speed up run-time calculations
Avoid constant computing inside the loops. Though most of the compilers do this optimization, it is still better if it is handled in our code rather than relying on the compiler.
for (int i=0; i <100; i++) {
offset = offset + strlen(str);
}
Shall be replaced to
int size = strlen(str);
for (int i=0; i <100; i++) {
offset = offset + size;
}
Avoid system calls in time critical section of the code
System calls are costly as it involves switching into kernel mode from user mode. For instances where two process needs to communicate, using shared memory is better than using message queues of pipes as shared memory does not incur any system calls to send and receive data.
Lazy / Minimal Evaluation
Lazy or delayed evaluation is the technique of delaying a computation until the result of the computation is actually needed for that code path.
For example, if the expressions are concatenated using logical ‘AND’ or ‘OR’ operator, the expression is evaluated from left to right. We shall gain CPU cycles by placing the expressions in correct position
Consider the code example
if (strcmp(str, “SYSTEM”) && flag == 1)
{
//…
}
In the above code segment, strcmp() function takes lot of time and for cases where flag != 1, it still does the comparison wasting lot of CPU cycles. The efficient code is
if (flag ==1 && strcmp(str, “SYSTEM”))
{
//…
}
The same is applicable for OR logical operator when flag == 1. In general, for expressions using operator &&, make the condition that is most likely to be false the leftmost condition. For expressions using operator ||, make the condition that is most likely to be true the leftmost condition
Frequently executed cases should come first in the switch case or if..elseif statement so that the number of comparisons is reduced for the frequently executed case.
The following switch statement is inefficient as integer datatype, which is used most of the time, is placed in the last case and then string data type. Double data type is seldom used in most of the applications, but it is placed in the first case.
switch(dataType) {
case typeDouble: { doDoubleAction(); break; }
case typeDate: {doDateAction();; break; }
case typeShort: {doShortAction();.; break; }
case typeString: {doStringAction(); break; }
case typeTimeStamp: {doTimeStampAction(); break; }
case typeInt: { doIntAction(); break; }
default: {doDefaultAction(); break; }
}
This shall be made efficient by keeping the frequently used data types first and followed by least and seldom-used data types.
switch(dataType) {
case typeInt: {…; break; }
case typeString: {…; break; }
case typeShort: {…; break; }
case typeDouble: {…; break; }
case typeTimeStamp: {…; break; }
case typeDate: {…; break; }
default: {…; break; }
}
Much more elegant way is to implement a jump table using function pointers. Consider the following code
typedef void (*functs)();
functs JumpTable[] = { doIntAction, doStringAction(), doShortAction() /* etc*/} ;
Place your function pointers in the same sequence of dataType enum. The above JumpTable assumes that DataType enum is defined as follows
enum DataType{
typeInt =0,
typeString,
typeShort,
/* other types */
};
To call the appropriate implementation just use the following statement
JumpTable[dataType]();
Now the compare operations are replaced with array indexing operation, which are much faster. Moreover the time taken for every data type is nearly the same as compared to if..elseif and switch() constructs.
Minimize local variables
If the number of local variables in a function is less, then the compiler will be able to fit them into registers, thereby avoiding access to the memory (stack). If no local variables need to be saved on the stack, the compiler need not set up and restore the frame pointer.
Reduce number of parameters
Function calls with large number of parameters may be expensive due to large number of parameter pushes on stack on each call. This is applicable even when struct is passed by value.
Declare local functions as static
If the function is small enough, it may be inlined without having to maintain an external copy if declared as static.
Avoid using global variables in performance critical code
Global variables are never allocated to registers. Global variables can be changed by assigning them indirectly using a pointer, or by a function call. Hence, the compiler cannot cache the value of a global variable in a register, resulting in extra loads and stores when globals are used.
Pointer chains
Pointer chains are frequently used to access information in structures. For example, a common code sequence is:
typedef struct { int x, y, color; } Point;
typedef struct { Point *pos, int something; } Pixel;
void initPixel(Pixel*p)
{
p->pos->x = 0;
p->pos->y = 0;
p->pos->color =0;
}
However, this code must reload p->pos for each assignment. A better version would cache p->pos in a local variable:
void initPixel(Pixel *p)
{
Point *pos = p->pos;
pos->x = 0;
pos->y = 0;
pos->color = 0;
}
Another way is to include the Point structure in the Pixel structure, thereby avoiding pointers completely. Some compilers do provide this optimization by default.
Replace frequent read/write with mmap
If the application does lot of read/write calls, then you should seriously consider to convert it into mmap() . This does not incur data copy from kernel to user and vice versa and avoids kernel mode switch for executing the read() or write() system call. You can use fsync() call, once to flush all the data to the disk file after you write all your data into the mapped memory area.
Place elements together, which are accessed together
When you declare a structure make sure that the data elements are declared in such a way that most frequently accessed elements at the beginning
For example if page element is accessed many times and hasSpace little less and freeNode rarely, then declare the structure as follows.
Struct Node{
Int page;
Int hasSpace;
Int freeNode;
///other types
};
This improves the performance because L1 or L2 cache miss, will not get only the required byte, it also gets one full cache line size of data into the appropriate cache. When page is cached, it also caches hasSpace if cache line is 64 bit. L1 cache line size is usually 64 bits and for L2, it is 128 bits.
Arithmetic operator
Replacing multiplication and division by shift and add operations.
X * 2 shall be replaced to X+X or X<<1
X * N shall be replaced by X<<I where N is 2 ^ I
X / N shall be replaced by X >>I where N is 2 ^ I
Faster for() loops
for( i=0; i<10; i++){ … }
If the code inside the loop does not care about the order of the loop counter, we can do this instead:
for( i=10; i–; ) { … }
This runs faster because it is quicker to process i– as the test condition, which says “Is i non-zero? If so, decrement it and continue”. For the original code, the processor has to calculate “Subtract i from 10. Is the result non-zero? If so, increment i and continue.”
Prefer int over char and short
If you have an integer value that can fit in a byte, you should still consider using an int to hold the number. This is because, when you use a char, the compiler will first convert the values into integer, perform the operations and then convert back the result to char. Doing so, will increase the memory usage but decreases the CPU cycles
Advise OS on how it should manage memory pages
Using madvise() system call, we can specify how the swapping and prefetching should be handled by the operating system. If you are accessing data sequentially, OS can prefetch some pages and keep it ready before the program references it. If program accesses data in some memory location frequently, it shall be locked in physical memory avoiding page swap to disk, using mlock() system call.
Thread Design Techniques
A) Pipeline
Task is divided into multiple sub tasks and each thread is responsible for doing one subtask. Job is passed from one thread to other till it reaches the final state.
B) Master/Servant
Master thread does accept the work and delegates work to its servants. Servants are created on the fly by the master when a work arrives to the master. After finishing the job servants go away. For next work, master creates new servants.
C) Thread Pool
Master thread does accept the work and delegates work to predefined set of servants, called thread pool. This is usually accomplished by using a queuing mechanism in between. Master adds request to the queue and all the threads in the pool removes the request from the queue and handles it iteratively.
Porting Unix Socket code to Windows
Windows sockets and UNIX type berkeley sockets provide pretty much similar interface which eases the porting effort. Apart from regular network system calls such as socket(), bind(), listen(), accept(),etc two API calls are important in Windows sockets
WSAStartup()-> should be called before calling any other winsock APIs
WSACleanup()-> should be called when program exit
After a socket connection is established, data can be transferred using the standard read() and write() calls as in UNIX sockets. Sockets must be closed by using the closesocket() function in Windows instead of close() in case of Windows.
Header File:
winsock2.h
Library:
Ws2_32.lib
Alternatively, you can add below line to header file
#pragma comment(lib, “Ws2_32.lib”);
More information on this topic can be found here @ msdn
Network Debugging Utility – netstat
netstat is a useful tool for checking your network configuration and statistics.
When invoked with the –i flag, it displays statistics for the network interfaces currently configured.
Output
Kernel Interface table
Iface MTU Met RX-OK RX-ERR RX-DRPRX-OVR TX-OK TX-ERRTX-DRPTX-OVR Flg
eth0 1500 0 245257 0 0 0 118056 0 0 0 BMRU
lo 16436 0 23632 0 0 0 23632 0 0 0 LRU
The MTU and Met fields show the current MTU and metric values for that interface. The RX and TX columns show how many packets have been received or transmitted error-free (RX-OK/TX-OK) or damaged (RX-ERR/TX-ERR); how many were dropped (RX-DRP/TX-DRP); and how many were lost because of an overrun (RX-OVR/TX-OVR).
When invoked with –a flag, netstat displays all the active internet socket connections
Output
Active Internet connections (servers and established)
Proto Recv-Q Send-Q Local Address Foreign Address State
tcp 0 0 prabakaran:irdmi *:* LISTEN
tcp 0 0 *:43812 *:* LISTEN
tcp 0 0 *:mysql *:* LISTEN
tcp 0 0 *:sunrpc *:* LISTEN
tcp 0 0 prabakaran:ncube-lm prabakaran:49594 ESTABLISHED
tcp 0 0 prabakaran:49594 prabakaran:ncube-lm ESTABLISHED
udp 0 0 prabakaran:filenet-cm *:*
…
Active UNIX domain sockets (servers and established)
Proto RefCnt Flags Type State I-Node Path
unix 2 [ ACC ] STREAM LISTENING 5624 @/tmp/fam-root-
unix 2 [ ACC ] STREAM LISTENING 5039 /var/run/cups/cups.sock
unix 2 [ ACC ] STREAM LISTENING 4986 /var/run/avahi-daemon/socket
unix 2 [ ACC ] STREAM LISTENING 6130 /tmp/.X11-unix/X0
where
Proto
specifies the protocol used by the socket(tcp, upd or raw)
Recv-Q
Total bytes not copied by the user program connected to this socket.
Send-Q
Total bytes not acknowledged by the remote host.
Local Address
Address and port number of the local end of the socket.
Foreign Address
Address and port number of the remote end of the socket.
State
It represents Socket state and is applicable only for TCP sockets. Possible values are
ESTABLISHED
The socket has an established connection.
SYN_SENT
The socket is actively attempting to establish a connection.
SYN_RECV
A connection request has been received from the network.
FIN_WAIT1
The socket is closed, and the connection is shutting down.
FIN_WAIT2
Connection is closed, and socket is waiting for a shutdown from the remote end.
TIME_WAIT
The socket is waiting after close to handle packets still in the network.
CLOSED
The socket is not being used.
CLOSE_WAIT
The remote end has shut down, waiting for the socket to close.
LAST_ACK
The remote end has shut down, and socket is closed but waiting for acknowledgement.
LISTEN
The socket is listening for incoming connections.
CLOSING
Both sockets are shut down but we still don’t have all our data sent.
UNKNOWN
The state of the socket is unknown.
When invoked with –p option, it displays the process ID and executable file name for all sockets.
Output
Active Internet connections (w/o servers)
Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name
tcp 0 0 prabakaran:ncube-lm prabakaran:49594 ESTABLISHED 1750/tnslsnr
…..
This option will help to know the process which owns the socket when bind() call returns “address already in use” for the port.
When invoked with –s option, it displays statistics of TCP, UDP, IP protocol such as total number of active and passive connections for TCP, failed connection attempts for TCP, established connections for TCP, total TCP segments received and sent, total UDP packets received and sent, packet receive errors for UDP, etc.
Inspecting What is inside executable or library
Sometimes, it is necessary to know what dynamic libraries an executable depends on and what functions are defined in particular library, etc. In this blog, some tools which aid understanding the content of executable / library is discussed.
size
Size is a diagnostic tool for executable file analysis. This utility displays the total size for each object file. The running program of the computer is called a process. Each process of the computer allocates some memory from RAM known as process memory. The process memory of the computer is divided into different blocks segment such as code, data, heap and stack.
Ex.
int x=100; // data
int y; // bss
void main()
{
static int a=10; // data
static int b; // bss
}
In the above program x and y are external variables where as a and b are static variable. All external and initialize static and external variable are created in Data Segment but uninitialized static and external variables are created in bss (uninitialized data segment) segment. To view the memory allocation by each of these segment we can use the size command.
$ size ./a.out
output:
data bss dec hex filename
252 8 1319 527 a.out
You can type the object files name to be examined. If none are specified, the file "a.out" will be used.
options
-d The total size is always displayed in decimal format.
-o The total size is always displayed in octal format.
$ size -x ./a.out
text data bss dec hex filename
0x3b7 0×100 0×10 1223 4c7 ./a.out
$ size -o ./a.out
text data bss oct hex filename
01667 0400 020 2307 4c7 ./a.out
ldd
ldd is a diagnostic tool for executable file analysis. It shows which shared libraries an executable would use in your environment. A C program is nothing but collection of some functions. Every function is defined in a library file (static/dynamic library). When a program is loaded along with that the respective dynamic libraries are loaded. To enhance yourself with the dynamic libraries used in your program you are equipped with a development tool called ldd. It gives you brief details about which dynamic library is loaded & which dynamic library is missing in your program.
Ex.
main()
{
printf(“Hello”);
bbsr();
}
Here main.c is the source file. On compiling generates the object file ( main.o). Now the object file is linked with the dynamic libraries (libc.so, sample.so) to build the executable file(singo).
For Example:
$ldd ./singo
Output :
linux-gate.so.1 => (0×00110000)
sample.so=> not found
libc.so.6 => /lib/libc.so.6 (0×00367000)
/lib/ld-linux.so.2 (0×00348000)
nm
This command list symbol names from object file. These symbol names can be either functions, global variables or static variables. For each symbol the value, symbol type & symbol names are displayed. Lower case symbol types means the symbol is local , upper case means the symbol is global.
Options:
-S print size of undefined symbols
-D Print dynamic, not normal, symbols. Useful only when working with dynamic objects (for example some kinds of shared libraries).
-l For each symbol, use debugging information to try to find a file name and line number. For a defined symbol, look for the line number of the address of the symbol. For an undefined symbol, look for the line number of a relocation entry which refers to the symbol. If line number information can be found, print it after the other symbol information.
Ex.
//prg.c
main()
{
bbsr();
printf(“Hello”);
ctc();
}
int ctc()
{
printf(“I am incuttack”);
}
$ gcc -c prg.c
$nm prg.o
output:
U bbsr
00000030 T ctc
00000000 T main
U printf
Here in the above output printf() & bbsr() are undefined.
$ nm -D -l -S ./a.out
080484b8 00000004 R _IO_stdin_used
w __gmon_start__
U __libc_start_main
U printf
objdump
It displays information about object files.
This command is used to disassemble shared objects & libraries. It locates the method in which the problem originates.
Options:
-d Display the assembler mnemonics for the machine instructions from object file. This option disassembles only those sections that are expected to contain instructions.
-s Display the full contents of any section requested.
-h Display summary information from the section headers of the object file.
using objdump -h to list the file section headers can’t show the correct addresses. Instead, it shows the usual addresses, which are implicit for the target.
Example
//prg1.c
main()
{
static int x=12;
static int y;
printf(“%d”,x);
bbsr();
}
int bbsr()
{
printf(“Hi”);
}
$objdump -h ./a.out
output:
./a.out: file format elf32-i386
Sections:
Idx Name Size VMA LMA File off Algn
0 .interp 00000013 08048134 08048134 00000134 2**0
CONTENTS, ALLOC, LOAD, READONLY, DATA
1 .note.ABI-tag 00000020 08048148 08048148 00000148 2**2
CONTENTS, ALLOC, LOAD, READONLY, DATA
2 .note.gnu.build-id 00000024 08048168 08048168 00000168 2**2
CONTENTS, ALLOC, LOAD, READONLY, DATA
…
$ objdump -s -j .rowdata ./a.out
output:
a.out: file format elf32-i386
$ objdump -d -r -j .text ./a.out
output:
./a.out: file format elf32-i386
Disassembly of section .text:
080482f0 <_start>:
80482f0: 31 ed xor %ebp,%ebp
80482f2: 5e pop %esi
80482f3: 89 e1 mov %esp,%ecx
80482f5: 83 e4 f0 and $0xfffffff0,%esp
80482f8: 50 push %eax
…
The above output is the disassembly of text section with the -d flag. This option disassembles the section which are expected to contains the instructions.
How to analyze issue without source code
Run time tracing allows the debugger to identify the problem on executable. This technique shall also be used to understand the code flow of the program. There are tools which traces all system calls or library routines.
strace utility provides the capability to trace the execution of an application from the perspective of system calls. Along with the system call routine, it also displays their arguments, return value, etc
Linux permits to access 4 GB virtual address space for a process and that 4 GB virtual space is divided into two parts such as user space ( 3 GB) and kernel space (1 GB). When we run any application program along with library , they allocates memory from user space . Here in the following example printf is a library function and internally uses a standard library libc.so. The function printf writes data “Hello” into monitor , but how it happens.
Ex.
void main()
{
printf(“Hello”);
}
Operating system uses system call to copy the data from user space to kernel space and write into device file which is associated with a device ( example stdout for monitor) . The driver known as modules works in kernel space which reads the data from a device file and write into the port .
To list out the system call maintained by o/s there is a development tool called strace .Ittrace the system calls and signals . It intercepts and records the system calls made by a running process. strace can print a record of each system call, its arguments, and its return value. strace is a system call tracer, i.e. a debugging tool which prints out a trace of all the system calls .
$strace ./singo
output:
open(“/etc/ld.so.cache”, O_RDONLY) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=111023, …}) = 0
mmap2(NULL, 111023, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb7f91000
close(3) = 0
open(“/lib/libc.so.6″, O_RDONLY) = 3
read(3, “\177ELF\1\1\1\3\3\1\360\3247004″…, 512) = 512
write(1, “Hello”, 5Hello) = 5
Each line starts with a system call name, is followed by its arguments in the parenthesis & then has return value at the end of the line. The important system call made in the above output are open,close,read,write. The last line write(1, “Hello”, 5Hello) = 5 line corresponding to the system call that cause Hello to be printed on the screen. The first argument -1 is the file descriptor of the file i.e. stdout to write to. So it writes the message on the terminal instead of writing in some other file on disk. The second argument tells the data is a string & third argument display the total number of character in the string.
$ strace -c ./singo
Hello% time seconds usecs/call calls errors syscall
—— ———– ———– ——— ——— —————-
nan 0.000000 0 1 read
nan 0.000000 0 1 write
nan 0.000000 0 2 open
nan 0.000000 0 2 close
nan 0.000000 0 1 execve
nan 0.000000 0 1 1 access
nan 0.000000 0 1 brk
nan 0.000000 0 1 munmap
nan 0.000000 0 2 mprotect
nan 0.000000 0 7 mmap2
nan 0.000000 0 3 fstat64
nan 0.000000 0 1 set_thread_area
—— ———– ———– ——— ——— —————-
100.00 0.000000 23 1 total
From the above output, it is evident that write system call is used 1time. This is initiated by printf() statement on stdout file descriptor.
One another important option of strace is -tt, it causes strace to print out the time at which each call finished.
Library Tracing with ltrace
ltrace allows tracing any library function calls. It traces not only libc.so, but also any third party libraries.
To run ltrace on the executable ‘test’ created in the previous section, run
$ltrace ./test
It displays the following
__libc_start_main(0x80484c4, 1, 0xbfea17f4, 0×8048570, 0×8048560 <unfinished …>
fopen(“info.txt”, “r”) = 0x8a03008
fgets(“1\n”, 80, 0x8a03008) = 0xbfea16fc
printf(“Line %d: %s”, 4568017, “1\n”Line 4568017: 1
) = 16
—more output—-
fgets(“10\n”, 80, 0x8a03008) = 0xbfea16fc
printf(“Line %d: %s”, 4568026, “10\n”Line 4568026: 10
) = 17
fgets(“10\n”, 80, 0x8a03008) = NULL
fclose(0x8a03008) = 0
+++ exited (status 0) +++
Without the source code, we can understand that fgets and printf functions are used heavily by this executable. We can try to optimize the code to reduce cpu cycles.
Most of the option present in strace is also present in ltrace. For the complete set of options run `man ltrace`.
Browsing Large source base using cscope
cscope is a developer’s tool for browsing source code. It allows searching code for: all references to a symbol, global definitions, functions etc.
Here is a simple example to illustrate the problem:
main()
{
FILE *p;
va_list p;
}
It’s quite difficult for a beginner to understand what is FILE & va_list. Simply you are made to know that p is a pointer to the file structure & va_list creates internally a variable argument list. But do you feel this information to be sufficient enough to clarify the doubt. Absolutely not, there you should surely go for the cscope.
cscope is used in two phases. First a developer builds the cscope database using find command to get the list of filenames that need to index into a file called cscope files.
Syntax:
$ find /usr/include/ -name “*.h”>myfile
This will include all header files to generate a database.
Second, the developer builds symbol database using cscope by specifying the file name list created in the above stop.
$ cscope -i myfile
This will take some time to build the symbol database & cscope will display the following input menu. Use up and down arrow key to move to the desired input menu.
Find this C symbol:
Find this global definition:
Find functions called by this function:
Find functions calling this function:
Find this text string:
Change this text string:
Find this egrep pattern:
Find this file:
Once the cscope cross reference database is built (usually named cscope.out and is generated in the current directory where cscope is run), you can search the database using ‘-d’ option of cscope. If you change the source files, then you may have to rebuild the symbol database.
Note: Make sure that when you are running cscope with –d option, cscope.out (symbol database) is present in the directory where are executing the cscope command.
Working with cscope
Open two terminals, one for writing the program & another to browse the code.
Invoke cscope in the second terminal, it will display two paragraphs, the top paragraph displays the results and bottom paragraph displays the options and accepts input from the user. Type va_list under “Find this global definition” menu as show below.
It will display the following
File Line
0 slang.h 1012 SL_EXTERN void (*SLang_VMessage_Hook) (char *, va_list);
1 slang.h 1019 SL_EXTERN void (*SLang_Exit_Error_Hook)(char *, va_list);
2 sqlite3ext.h 151 char *(*vmprintf)(const char *,va_list);
3 stdio.h 80 typedef _G_va_list va_list;
4 tiffio.h 259 typedef void (*TIFFErrorHandler)(const char *, const char *,
va_list);
* 3 more lines – press the space bar to display more *
Find this C symbol:
Find this global definition: va_list
Find functions called by this function:
Find functions calling this function:
Find this text string:
Change this text string:
Find this egrep pattern:
Find this file:
The top paragraph which displays the result, contains identifiers for each result line. In our case, it displays numbers 0 to 4. The output also contains file name, line number and the exact string which matched the input string specified. By pressing the appropriate key (in our case 0 to 4), automatically vi opens that file. When you exit from the vi editor using “:q” you will be taken back to the cscope window. Press CTRL-D to exit from the cscope window.
Commands console to deal with cscope
:q – To go back to the menu.
Ctrl+d – To stop cscope or change the marked lines and exit
Up-arrow & down arrow – To move the cursor in the menu list.
Tab – For moving the cursor from upper paragraph to lower paragraph.
Space bar -To display more lines
0-9a-zA-Z – To edit the file referenced by the given identifier in the top output paragraph
or to mark or unmark lines to be changed
+ – Displays next set of matching lines
- – Displays previous set of matching lines
* – Mark or unmark all displayed lines to be changed
Esc – Exit without changing the marked line.
Advantages of cscope
It is very useful when working on large project which contain thousands of source code files. It reduces the development effort by multi fold and increases the productivity.
- While development, it is difficult for developers to remember all the structure names and its members. Often we search the required header file to know the members of a structure. This can be done easily using cscope
- Often we need to know, before modifying a function (adding new arguments, changing implementation, etc), we need to make sure that all the callers of that function do not get affected by this modification. To do this we need to know first what functions are calling these functions. cscope tool comes handy to reduce your effort
- When we are changing MACRO name or function name, all the files which references that macro or calls that function needs to be changed. With cscope you can do this much faster than doing it manually by editing each and every source file after searching that macro.
- It helps maintenance engineers to understand the code flow of huge source base written by somebody else. If this tool is used together with “gdb”, it provides both static and run time code flow of the part of the code under investigation for bug analysis. This reduces the bug analysis and fixing time for maintenance engineers drastically
Computer Science Video Lectures
Collection of video lecture links related to computer science subjects
BSD Unix ( History, File system, Internals, etc)
Harvard University (CS50 – Introduction to computer science)
Center for IT Research in the interest of society
Others
Harvard University ( Justice Series: What is the right thing to do?- Michael Sandel)
