Assembly Code Level Optimization Techniques

Optimizing at the code level is not as effective as higher-level optimizations. Memory allocation and algorithmic optimizations should be tried before attempting code level optimization. Code level optimization is performed by examining the assembly instructions generated for one line of source code or for one function, which requires the help of the assembler and disassembler.

The output of the compiler (object files and executables) must conform to a standard format so that it can be loaded and run by the operating system. On Linux this format is called ELF (Executable and Linkable Format), and the output of all compilers, including FORTRAN, C and C++, conforms to it. It defines how an executable file is organized so that the loader knows where to find code, data, etc.
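As a quick check, the ELF header of an object file or executable can be displayed with the readelf tool, for example (using the object file built in the example below):

$readelf -h fact.o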

Analyzing assembly instructions is tricky and requires knowledge of assembly language. To get a basic understanding of how C source code gets transformed into assembly, you can use the objdump tool.

For example, consider the following program (fact.c)

#include <stdio.h>
int factorial(int number)
{
    if (number >12) return(1);
    int fact =1;
    int i =1;
    for (; i<=number; i++)
    {
        fact = fact * i;
    }
    return fact;
}
int main()
{
   printf("%d\n", factorial(12));
   return 0;
}

Compile the above code with debugging enabled (-g option):

$gcc -g -c fact.c
The objdump -S option displays the source code and its corresponding assembly instructions together. This helps us understand how the C code is transformed into assembly by the compiler.

$objdump -S fact.o
fact.o:     file format elf32-i386
Disassembly of section .text:
00000000 <factorial>:
#include <stdio.h>
int factorial(int number)
{
   0:   55                      push   %ebp
   1:   89 e5                   mov    %esp,%ebp
   3:   83 ec 14                sub    $0x14,%esp
    if (number >12) return(1);
   6:   83 7d 08 0c             cmpl   $0xc,0x8(%ebp)
   a:   7e 09                   jle    15 <factorial+0x15>
   c:   c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
  13:   eb 2c                   jmp    41 <factorial+0x41>
    int fact =1;
  15:   c7 45 f8 01 00 00 00    movl   $0x1,0xfffffff8(%ebp)
    int i =1;
  1c:   c7 45 fc 01 00 00 00    movl   $0x1,0xfffffffc(%ebp)
    for (; i<=number; i++)
  23:   eb 0e                   jmp    33 <factorial+0x33>
    {
        fact = fact * i;
  25:   8b 45 f8                mov    0xfffffff8(%ebp),%eax
  28:   0f af 45 fc             imul   0xfffffffc(%ebp),%eax
  2c:   89 45 f8                mov    %eax,0xfffffff8(%ebp)
  2f:   83 45 fc 01             addl   $0x1,0xfffffffc(%ebp)
  33:   8b 45 fc                mov    0xfffffffc(%ebp),%eax
  36:   3b 45 08                cmp    0x8(%ebp),%eax
  39:   7e ea                   jle    25 <factorial+0x25>
    }
    return fact;
  3b:   8b 45 f8                mov    0xfffffff8(%ebp),%eax
  3e:   89 45 ec                mov    %eax,0xffffffec(%ebp)
  41:   8b 45 ec                mov    0xffffffec(%ebp),%eax
}
  44:   c9                      leave
  45:   c3                      ret

00000046 <main>:
int main()
{
  46:   8d 4c 24 04             lea    0x4(%esp),%ecx
  4a:   83 e4 f0                and    $0xfffffff0,%esp
  4d:   ff 71 fc                pushl  0xfffffffc(%ecx)
  50:   55                      push   %ebp
  51:   89 e5                   mov    %esp,%ebp
  53:   51                      push   %ecx
  54:   83 ec 14                sub    $0x14,%esp
   printf(“%d\n”, factorial(12));
  57:   c7 04 24 0c 00 00 00    movl   $0xc,(%esp)
  5e:   e8 fc ff ff ff          call   5f <main+0x19>
  63:   89 44 24 04             mov    %eax,0x4(%esp)
  67:   c7 04 24 00 00 00 00    movl   $0x0,(%esp)
  6e:   e8 fc ff ff ff          call   6f <main+0x29>
   return 0;
  73:   b8 00 00 00 00          mov    $0x0,%eax
}
  78:   83 c4 14                add    $0x14,%esp
  7b:   59                      pop    %ecx
  7c:   5d                      pop    %ebp
  7d:   8d 61 fc                lea    0xfffffffc(%ecx),%esp
  80:   c3                      ret

Even if you do not know assembly language, you can still use this technique to improve the code: count the total number of assembly instructions produced for a function before and after your code changes.

The compiler (gcc) also provides the -S option to stop processing after producing an assembly file.

Consider the following program t1.c

#include<stdlib.h>
typedef struct element
{
    int              val;
    struct element  *next;
} Element;
Element *head;

void insert(int no)
{
    Element *newNode = (Element *) malloc( sizeof( Element ) );
    newNode->val = no;
    newNode->next = NULL;
    if (NULL == head) { head = newNode; return ;}
    Element *iter = head;
    while (NULL != iter->next) iter = iter->next;
    iter->next = newNode;
    return;
}
int main()
{
    insert(10);
    return 0;
}
Generate the assembly file for the above program as follows

$gcc -O2 -S t1.c
gcc will produce an assembly output file whose name is the same as the original C file with a .s suffix (t1.s in this example). The listing is long, so only the relevant part is shown below.

The output file contains the following

        .file   "t1.c"
        .text
        .p2align 4,,15
.globl insert
        .type   insert, @function
insert:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        movl    $8, (%esp)
        call    malloc
        movl    head, %edx
        testl   %edx, %edx
        movl    %eax, %ecx
        movl    8(%ebp), %eax
        movl    $0, 4(%ecx)
        movl    %eax, (%ecx)
        jne     .L8
        jmp     .L11
        .p2align 4,,7
.L5:
        movl    %eax, %edx
.L8:
        movl    4(%edx), %eax
        testl   %eax, %eax
        jne     .L5
        movl    %ecx, 4(%edx)
        leave
        .p2align 4,,2
        ret
.L11:
        movl    %ecx, head
        leave
        ret
        .size   insert, .-insert
        .p2align 4,,15
.globl main
        .type   main, @function
main:
        leal    4(%esp), %ecx
        andl    $-16, %esp
        pushl   -4(%ecx)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ecx
        subl    $4, %esp
….
The insert function has approximately 25 instructions with three branches.

If the program only needs a simple linked list and does not depend on insertion order, we can modify it as follows (t2.c), inserting new nodes at the head instead of walking to the tail:

#include<stdio.h>
#include<stdlib.h>
typedef struct element
{
    int              val;
    struct element  *next;
} Element;
Element *head;
void insert(int no)
{
    Element *newNode = (Element *) malloc( sizeof( Element ) );
    newNode->val = no;
    newNode->next = head;
    head = newNode;
    return;
}
int main()
{
    insert(10);
    return 0;
}
Generate the assembly instruction

$gcc -O2 -S t2.c
t2.s contains

        .file   "t2.c"
        .text
        .p2align 4,,15
.globl insert
        .type   insert, @function
insert:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        movl    $8, (%esp)
        call    malloc
        movl    8(%ebp), %edx
        movl    %edx, (%eax)
        movl    head, %edx
        movl    %edx, 4(%eax)
        movl    %eax, head
        leave
        ret
        .size   insert, .-insert
        .p2align 4,,15
.globl main
        .type   main, @function
main:
        leal    4(%esp), %ecx
        andl    $-16, %esp
        pushl   -4(%ecx)
        pushl   %ebp
        movl    %esp, %ebp

Now the insert() function contains only 12 instructions; we have optimized the code to run faster without changing its behavior. Note that this change cannot be made if main() assumes that the linked list is retrieved in FIFO order.


Code Level Optimization Techniques

Some of the techniques mentioned below ("arithmetic operators", "jump table to replace if...else if", "faster for loops") make the code hard to debug, unportable and unmaintainable. Attempt these techniques only if there is no other way to optimize.

Use pass by reference for user-defined types

Passing parameters by value to a function results in the complete parameter being copied onto the stack. For large user-defined types, pass by value should be replaced by a pointer (or, in C++, a const reference) to avoid this data copy. Returning large objects by value has the same performance issue; instead, pass them as OUT parameters of the function.
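A minimal sketch in C (the Record type and the function names are illustrative, not from the original text): the first version copies the whole structure onto the stack on every call, the second passes only a pointer, and the third uses an OUT parameter instead of returning a large object by value.

#include <string.h>

typedef struct { char name[256]; int scores[64]; } Record;

/* Pass by value: the entire ~512-byte Record is copied onto the stack. */
int totalByValue(Record r)
{
    int sum = 0;
    for (int i = 0; i < 64; i++) sum += r.scores[i];
    return sum;
}

/* Pass by reference: only a pointer is copied; const documents read-only use. */
int totalByRef(const Record *r)
{
    int sum = 0;
    for (int i = 0; i < 64; i++) sum += r->scores[i];
    return sum;
}

/* OUT parameter instead of returning a large Record by value. */
void buildRecord(const char *name, Record *out)
{
    memset(out, 0, sizeof(*out));
    strncpy(out->name, name, sizeof(out->name) - 1);
}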

Pre-compute quantities to speed up run-time calculations

Avoid computing constants inside loops. Though most compilers perform this optimization, it is better to handle it in our code rather than relying on the compiler.

for (int i=0; i <100; i++) {
    offset = offset + strlen(str);
}
shall be replaced with

int size = strlen(str);
for (int i=0; i <100; i++) {
    offset = offset + size;
}

Avoid system calls in time-critical sections of the code

System calls are costly as they involve switching from user mode to kernel mode. When two processes need to communicate, using shared memory is better than using message queues or pipes, because shared memory does not incur any system calls to send and receive data after the initial setup, as sketched below.
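A hedged sketch of the shared memory approach (the key 0x1234, the 4 KB size and the message are arbitrary choices for illustration): after the initial shmget()/shmat() setup calls, data is exchanged with plain memory accesses, with no system call per transfer.

#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main()
{
    /* create (or attach to) a 4 KB System V shared memory segment */
    int shmid = shmget((key_t) 0x1234, 4096, IPC_CREAT | 0666);
    if (shmid < 0) { perror("shmget"); return 1; }

    char *data = (char *) shmat(shmid, NULL, 0);   /* map it into our address space */
    if (data == (char *) -1) { perror("shmat"); return 1; }

    /* from here on, writes are ordinary stores: no system call, no kernel copy */
    strcpy(data, "hello from the writer process");

    shmdt(data);                                   /* detach when done */
    return 0;
}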

Lazy / Minimal Evaluation 

Lazy or delayed evaluation is the technique of delaying a computation until the result of the computation is actually needed for that code path. 

For example, when sub-expressions are combined with the logical AND (&&) or OR (||) operator, the expression is evaluated from left to right and evaluation stops as soon as the result is known. We can save CPU cycles by placing the sub-expressions in the right order.

Consider the code example

if (strcmp(str, "SYSTEM") && flag == 1)

{

   //…

}

In the above code segment, the strcmp() call is expensive, and for cases where flag != 1 it still performs the comparison, wasting CPU cycles. The efficient code is:

if (flag == 1 && strcmp(str, "SYSTEM"))

{

   //…

}

The same applies to the logical OR operator. In general, for expressions using the && operator, make the condition that is most likely to be false the leftmost condition. For expressions using the || operator, make the condition that is most likely to be true the leftmost condition.

Frequently executed cases should come first in a switch or if...else if statement so that the number of comparisons is reduced for the frequently executed case.

The following switch statement is inefficient: the integer data type, which is used most of the time, is placed in the last case, preceded by the string type, while the double type, which is seldom used in most applications, is placed first.

switch(dataType) {
  case typeDouble: { doDoubleAction(); break; }
  case typeDate: { doDateAction(); break; }
  case typeShort: { doShortAction(); break; }
  case typeString: {doStringAction(); break; }
  case typeTimeStamp: {doTimeStampAction(); break; }
  case typeInt: { doIntAction(); break; }
  default: {doDefaultAction(); break; }
}
This can be made efficient by keeping the frequently used data types first, followed by the less and seldom-used ones.

switch(dataType) {
  case typeInt: {…; break; }
  case typeString: {…; break; }
  case typeShort: {…; break; }
  case typeDouble: {…; break; }
  case typeTimeStamp: {…; break; }
  case typeDate: {…; break; }
  default: {…; break; }
}
A much more elegant way is to implement a jump table using function pointers. Consider the following code:

typedef void (*functs)();
functs JumpTable[] = { doIntAction, doStringAction, doShortAction /* etc */ };

Place your function pointers in the same sequence as the dataType enum. The above JumpTable assumes that the DataType enum is defined as follows:

enum DataType {
    typeInt = 0,
    typeString,
    typeShort,
    /* other types */
};

To call the appropriate implementation just use the following statement

JumpTable[dataType]();

Now the compare operations are replaced with an array indexing operation, which is much faster. Moreover, the time taken is nearly the same for every data type, unlike the if...else if and switch() constructs.

Minimize local variables

If the number of local variables in a function is small, the compiler will be able to fit them into registers, thereby avoiding accesses to memory (the stack). If no local variables need to be saved on the stack, the compiler also need not set up and restore the frame pointer.

Reduce number of parameters

Function calls with a large number of parameters may be expensive due to the large number of parameter pushes onto the stack on each call. This applies even when a struct is passed by value.

Declare local functions as static

If a function is declared static and is small enough, the compiler may inline it without having to maintain an external copy.

Avoid using global variables in performance critical code

Global variables are never allocated to registers. A global variable can be changed indirectly through a pointer or by a function call, so the compiler cannot cache its value in a register; this results in extra loads and stores whenever globals are used.
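A small illustrative sketch (the variable and function names are made up): copying the global into a local variable lets the compiler keep the running value in a register inside the loop, with a single load before and a single store after.

extern int errorCount;                  /* global: compiler must assume it can change */

void countErrors(const int *codes, int n)
{
    int local = errorCount;             /* load the global once */
    for (int i = 0; i < n; i++)
        if (codes[i] != 0)
            local++;                    /* register arithmetic, no global loads/stores */
    errorCount = local;                 /* store it back once */
}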

Pointer chains

Pointer chains are frequently used to access information in structures. For example, a common code sequence is:

typedef struct { int x, y, color; } Point;
typedef struct { Point *pos; int something; } Pixel;

void initPixel(Pixel *p)
{
    p->pos->x = 0;
    p->pos->y = 0;
    p->pos->color = 0;
}

However, this code must reload p->pos for each assignment. A better version would cache p->pos in a local variable:

void initPixel(Pixel *p)
{
    Point *pos = p->pos;
    pos->x = 0;
    pos->y = 0;
    pos->color = 0;
}

Another way is to embed the Point structure directly in the Pixel structure, thereby avoiding the pointer completely, as sketched below. Some compilers do provide this optimization by default.
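A sketch of that alternative: with Point embedded by value, the fields are reached at fixed offsets from p and the extra pointer load disappears.

typedef struct { int x, y, color; } Point;
typedef struct { Point pos; int something; } Pixel;   /* Point embedded, no pointer */

void initPixel(Pixel *p)
{
    p->pos.x = 0;          /* direct offsets from p, no p->pos load */
    p->pos.y = 0;
    p->pos.color = 0;
}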

Replace frequent read/write with mmap

If the application makes a lot of read()/write() calls, you should seriously consider converting it to use mmap(). This avoids the data copy between kernel and user space and the kernel mode switch needed to execute each read() or write() system call. After writing all your data into the mapped memory area, you can flush it to the disk file once, using msync() on the mapped region (or fsync() on the underlying file descriptor).
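A hedged sketch of the idea (the file path and the 4-byte header update are illustrative): the file is mapped once, modified through ordinary memory stores, and flushed at the end, instead of issuing a write() per update.

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int updateFile(const char *path)
{
    int fd = open(path, O_RDWR);
    if (fd < 0) return -1;

    struct stat st;
    if (fstat(fd, &st) < 0) { close(fd); return -1; }

    /* map the whole file; subsequent accesses are plain loads and stores */
    char *data = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) { close(fd); return -1; }

    memcpy(data, "HDR1", 4);               /* "write" to the file without write() */

    msync(data, st.st_size, MS_SYNC);      /* flush the dirty pages to disk */
    munmap(data, st.st_size);
    close(fd);
    return 0;
}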

Place elements together, which are accessed together

When you declare a structure, make sure the data elements are declared such that the most frequently accessed elements are at the beginning.

For example, if the page element is accessed many times, hasSpace a little less, and freeNode rarely, then declare the structure as follows:

struct Node {
    int page;
    int hasSpace;
    int freeNode;
    /* other members */
};

This improves performance because an L1 or L2 cache miss does not fetch only the required bytes; it brings one full cache line of data into the appropriate cache. So when page is loaded, hasSpace (and likely freeNode) is cached along with it, since they fall in the same cache line. L1 cache lines are usually 64 bytes, and L2 lines are 64 or 128 bytes depending on the processor.

Arithmetic operators

Replace multiplication and division by powers of two with shift and add operations:

X * 2 can be replaced with X + X or X << 1

X * N can be replaced with X << I, where N is 2^I

X / N can be replaced with X >> I, where N is 2^I (safe only for unsigned or known non-negative X, since right-shifting a negative value does not round the same way integer division does)
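A minimal sketch of these replacements in C; they apply only to unsigned (or known non-negative) values and power-of-two constants, and a modern compiler usually performs them automatically:

unsigned int twice(unsigned int x)  { return x + x; }   /* x * 2  */
unsigned int times8(unsigned int x) { return x << 3; }  /* x * 8, since 8 = 2^3  */
unsigned int div16(unsigned int x)  { return x >> 4; }  /* x / 16, since 16 = 2^4 */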

Faster for() loops 

for( i=0; i<10; i++){ … } 

If the code inside the loop does not care about the order of the loop counter, we can do this instead:

for( i=10; i--; ) { … }

This runs faster because it is quicker to process i-- as the test condition, which simply asks "Is i non-zero? If so, decrement it and continue." For the original code, the processor has to compare i against the constant 10 on every iteration: "Is i less than 10? If so, increment i and continue."

Prefer int over char and short

If you have an integer value that can fit in a byte, you should still consider using an int to hold it. This is because when you use a char, the compiler typically has to convert the value to an int, perform the operation, and convert the result back to char. Using int increases memory usage slightly but reduces the CPU cycles spent on these conversions.

Advise OS on how it should manage memory pages

Using the madvise() system call, we can tell the operating system how swapping and prefetching should be handled for a memory range. If you access data sequentially, the OS can prefetch pages and have them ready before the program references them. If the program accesses data in some memory region frequently, that region can be locked in physical memory with the mlock() system call, avoiding page-outs to disk.
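A small hedged sketch (buf is assumed to be a page-aligned region obtained from mmap(), and len its length):

#include <stddef.h>
#include <sys/mman.h>

void tuneAccess(void *buf, size_t len)
{
    /* we will read this region sequentially: the kernel may prefetch ahead */
    madvise(buf, len, MADV_SEQUENTIAL);

    /* this region is hot: keep it resident in physical memory, never swap it out */
    mlock(buf, len);
}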

Browsing Large source base using cscope

cscope is a developer's tool for browsing source code. It allows searching the code for all references to a symbol, global definitions, functions called by or calling a given function, text strings, and more.

Here is a simple example to illustrate the problem:

#include <stdio.h>
#include <stdarg.h>

int main()
{
    FILE *p;
    va_list ap;
    return 0;
}

It is quite difficult for a beginner to understand what FILE and va_list really are. You may simply be told that p is a pointer to the FILE structure and that ap holds a variable argument list, but is that information sufficient to clear the doubt? Certainly not; this is where cscope helps.

cscope is used in two phases. First, the developer uses the find command to build the list of file names that need to be indexed and stores it in a file (here called myfile).

Syntax:

$ find /usr/include/ -name "*.h" > myfile

This collects all the header files under /usr/include for generating the database.

Second, the developer builds the symbol database by running cscope with the file name list created in the above step.

$ cscope -i myfile

This will take some time to build the symbol database, and cscope will then display the following input menu. Use the up and down arrow keys to move to the desired input field.

 

Find this C symbol:

Find this global definition:

Find functions called by this function:

Find functions calling this function:

Find this text string:

Change this text string:

Find this egrep pattern:

Find this file:

 

Once the cscope cross-reference database is built (usually named cscope.out and generated in the directory where cscope is run), you can search it using the -d option of cscope. If you change the source files, you will have to rebuild the symbol database.

Note: When running cscope with the -d option, make sure that cscope.out (the symbol database) is present in the directory where you execute the cscope command.

Working with cscope

Open two terminals: one for writing the program and another for browsing the code.

Invoke cscope in the second terminal. It displays two panes: the top pane shows the results and the bottom pane shows the query fields and accepts input from the user. Type va_list in the "Find this global definition:" field as shown below.

It will display the following

File Line

0 slang.h 1012 SL_EXTERN void (*SLang_VMessage_Hook) (char *, va_list);

1 slang.h 1019 SL_EXTERN void (*SLang_Exit_Error_Hook)(char *, va_list);

2 sqlite3ext.h 151 char *(*vmprintf)(const char *,va_list);

3 stdio.h 80 typedef _G_va_list va_list;

4 tiffio.h 259 typedef void (*TIFFErrorHandler)(const char *, const char *,

va_list);

* 3 more lines – press the space bar to display more *

Find this C symbol:

Find this global definition: va_list

Find functions called by this function:

Find functions calling this function:

Find this text string:

Change this text string:

Find this egrep pattern:

Find this file: 

The top pane, which displays the results, contains an identifier for each result line; in our case it shows numbers 0 to 4. The output also contains the file name, the line number, and the exact line that matched the specified input string. Pressing the corresponding key (0 to 4 in our case) automatically opens that file in vi. When you exit the vi editor with ":q", you are taken back to the cscope window. Press Ctrl-D to exit the cscope window.

Console commands for cscope

:q – To go back to the menu.

Ctrl+d – To stop cscope or change the marked lines and exit

Up-arrow & down arrow – To move the cursor in the menu list.

Tab – For moving the cursor from upper paragraph to lower paragraph.

Space bar -To display more lines

0-9a-zA-Z – To edit the file referenced by the given identifier in the top (output) pane, or to mark or unmark lines to be changed

+ – Displays next set of matching lines

– – Displays previous set of matching lines

* – Mark or unmark all displayed lines to be changed

Esc – Exit without changing the marked line. 

Advantages of cscope

It is very useful when working on large projects containing thousands of source files. It reduces development effort manifold and increases productivity.

  • During development, it is difficult for developers to remember all the structure names and their members. Often we have to search through header files to find the members of a structure; this can be done easily with cscope.
  • Before modifying a function (adding new arguments, changing its implementation, etc.), we need to make sure that none of its callers are affected by the change. To do this, we first need to know which functions call it; cscope comes in handy here and reduces the effort.
  • When renaming a macro or a function, every file that references that macro or calls that function needs to be changed. With cscope you can do this much faster than manually searching for the macro and editing each source file.
  • It helps maintenance engineers understand the code flow of a huge source base written by somebody else. Used together with gdb, it provides both the static and run-time code flow of the part of the code under investigation, which drastically reduces bug analysis and fixing time.

 

Debugging Techniques

Finding a bug is a process of confirming the many things you believe are true until you find the one that is not.
Below are some of the things developers typically believe to be true:

When the function is called, it receives its parameters correctly.

At a certain line or function in your source file, a certain variable has a certain value.

In a given if-then-else statement, the “else” part is the one that is executed.

Divide and Conquer

The simplest technique for debugging is monitoring: add printf statements to the source code between the point where you know the data is good and the point where it becomes corrupted.
Since we usually do not know exactly where the data is good and where it goes bad, printf statements are typically added ad hoc to track the control flow and data values during execution. After adding them, run the program and keep adding printf statements until you isolate the area of the program that contains the wrong data, then concentrate on why and how that data gets modified.
In this way you can start zeroing in on the section of code that contains the error.

Debug Macros

The preprocessor's conditional compilation directives can be used as a switch to include or exclude debug messages.

For example:

#ifdef DEBUG
    printf("sum %d\n", sum);
#endif

The program can be compiled with DEBUG undefined for normal use; when debugging is needed, define it as follows:

#define DEBUG
The following program prints the debug messages

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEBUG
int main()
{
    char *str = (char*) malloc(100);
    strcpy(str, "This is a simple program");
    int sum = 0;
    for (char *iter = str; *iter != '\0'; iter++) {
        #ifdef DEBUG
            printf("char is %c and sum is %d\n", *iter, sum);
        #endif
        sum = sum + *iter;
    }
    return 0;
}

You can redirect the output that would normally go to the screen to a file. For example:

$./test > tmp.out

runs the program test with debugging enabled and sends the output to the file tmp.out.

You can use your text editor’s search capabilities to look for the information you want to find.

For large programs with millions of lines of code, the above mechanism will dump millions of debug messages, which makes finding the bug difficult. We can use multiple debug levels to tackle this issue. Based on the size of the software, define several debugging levels: lower levels produce fewer messages, and the higher the level, the more messages it produces. When debugging an issue, start with a lower level; if you cannot figure out the cause, move to the next level, and so on.

The following set of macros defines three debugging levels, DLEVEL3, DLEVEL2 and DLEVEL1, where defining a higher level automatically enables the lower ones.

#ifdef DLEVEL3
   #define DLEVEL2
#endif
#ifdef DLEVEL2
    #define DLEVEL1
#endif
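On top of these levels, each level can map to a print macro that compiles away when its level is not enabled. A sketch assuming gcc's ##__VA_ARGS__ variadic macro extension; the DPRINT1/DPRINT2 names are illustrative:

#include <stdio.h>

#ifdef DLEVEL1
    #define DPRINT1(fmt, ...) printf("DBG1: " fmt, ##__VA_ARGS__)
#else
    #define DPRINT1(fmt, ...)
#endif

#ifdef DLEVEL2
    #define DPRINT2(fmt, ...) printf("DBG2: " fmt, ##__VA_ARGS__)
#else
    #define DPRINT2(fmt, ...)
#endif

/* usage: DPRINT1("sum %d\n", sum); expands to nothing unless DLEVEL1 is defined */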

If your software consists of many modules, module-specific debug messages can be enabled so that only messages related to the module under investigation are shown.

Run Time Tracing

The above techniques require access to the source code to identify the problem. Run-time tracing allows you to investigate the problem on the executable itself. This technique can also be used to understand the code flow of a program. There are tools that trace all system calls or all library routines.

The strace utility provides the capability to trace the execution of an application from the perspective of system calls. Along with each system call, it displays the call's arguments, return value, etc.

On 32-bit Linux, each process gets a 4 GB virtual address space, divided into two parts: user space (3 GB) and kernel space (1 GB). When we run an application program, it and the libraries it uses allocate memory from user space. In the following example, printf is a library function from the standard C library (libc.so). The printf call writes the data "Hello" to the monitor, but how does that happen?

Ex.

#include <stdio.h>

int main()
{
    printf("Hello");
    return 0;
}

The C library issues a system call to copy the data from user space to kernel space and write it to the device file associated with the device (stdout, connected to the terminal, in this example). The driver, a module running in kernel space, reads that data and writes it to the device.

To trace the system calls made to the operating system, there is a development tool called strace. It traces system calls and signals: it intercepts and records the system calls made by a running process and can print a record of each system call, its arguments, and its return value.

$strace ./singo

output:

open("/etc/ld.so.cache", O_RDONLY) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=111023, ...}) = 0
mmap2(NULL, 111023, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb7f91000
close(3) = 0
open("/lib/libc.so.6", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\3\3\1\360\3247004"..., 512) = 512
write(1, "Hello", 5Hello) = 5

Each line starts with the system call name, followed by its arguments in parentheses and its return value at the end of the line. The important system calls in the above output are open, close, read and write. The last line, write(1, "Hello", 5Hello) = 5, corresponds to the system call that causes Hello to be printed on the screen; the stray "Hello" after the 5 is the program's own output interleaved with the trace. The first argument, 1, is the file descriptor to write to, i.e. stdout, so the message goes to the terminal instead of some other file on disk. The second argument is the buffer containing the string, and the third is the number of characters to write.

$ strace -c ./singo

Hello% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
   nan    0.000000           0         1           read
   nan    0.000000           0         1           write
   nan    0.000000           0         2           open
   nan    0.000000           0         2           close
   nan    0.000000           0         1           execve
   nan    0.000000           0         1         1 access
   nan    0.000000           0         1           brk
   nan    0.000000           0         1           munmap
   nan    0.000000           0         2           mprotect
   nan    0.000000           0         7           mmap2
   nan    0.000000           0         3           fstat64
   nan    0.000000           0         1           set_thread_area
------ ----------- ----------- --------- --------- ----------------
100.00    0.000000                      23         1 total

From the above output, it is evident that the write system call is used one time; this is the write initiated by the printf() statement on the stdout file descriptor.

Another important option of strace is -tt, which prefixes each line of the trace with the wall-clock time, including microseconds.

ltrace can be used to monitor user library calls. Most of the options present in strace are also available in ltrace. For the complete set of options, run `man ltrace`.

Interactive Debugging

Interactive debuggers give you the ability to stop the program at any point, examine and change variables, and continue execution, helping you identify the code that changes the behavior and makes the program buggy. The GNU Debugger (gdb) is an interactive debugging tool that lets you see what is going on inside a program while it executes.
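A minimal session sketch, reusing the fact.c example from earlier (the breakpoint location and the value changed are only illustrative):

$gcc -g fact.c -o fact
$gdb ./fact
(gdb) break factorial
(gdb) run
(gdb) print number
(gdb) set var number = 5
(gdb) continue
(gdb) quit

Here break stops the program when factorial() is entered, print examines a variable, set var changes it, and continue resumes execution.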

Introspection

Linux provides a lot of information and tools for investigating a running process (your executable). It stores information about each process running in the system in the /proc file system.
This directory has a subdirectory for each running process, named after its process ID.

$ls /proc
1     204   2481  2927   30479  399   9            kallsyms    swaps
10    2040  2518  2996   30535  4     acpi         kcore       sys
11    2047  2526  3      3094   400   asound       keys        sysrq-trigger
1477  205   2538  3001   3096   404   buddyinfo    key-users   sysvipc
1478  2092  2582  3002   3102   411   bus          kmsg        timer_list
1502  2173  2654  3003   3103   418   cmdline      loadavg     timer_stats
1653  2207  2690  3004   3114   429   cpuinfo      locks       tty
1654  2226  2711  3005   3115   457   crypto       mdstat      uptime
167   2237  2733  3115  3137   494   devices      meminfo     version
168   2249  2766  5218  3163   5     diskstats    misc        vmcore
169   2264  2778  30146  3165   5051  dma          modules     vmstat
172   2302  2779  30283  3523   5052  driver       mounts      zaptel
174   2319  2796  30285  3525   51    execdomains  mtrr        zoneinfo
1990  2335  2797  30367  3526   52    fb           net
1992  2354  2808  30398  355    53    filesystems  partitions
2     2359  2809  30421  388    5555  fs           schedstat
201   2381  2838  30442  389    5557  interrupts   scsi
202   2395  2839  30444  390    6     iomem        self
2020  2408  2847  30445  397    7     ioports      slabinfo
203   2431  2859  30477  398    8     irq          stat

Find out the process ID of your program using the ps command. Let's say your executable name is csqlserver:

$ps -ef |grep csqlserver
csql      5218  3115  0 13:35 pts/1    00:00:00 csqlserver
csql      5370  3526  0 13:42 pts/3    00:00:00 grep csqlserver

For the above output, the pid is 5218.
Under this directory (/proc/5218) you will find a lot of useful information that helps in debugging your application.
Some of the important entries are:
cmdline – contains the command line (executable name and options) with which you started the process
environ – contains the environment variables (PATH, HOME, etc.) this process uses
maps – contains the memory map of the process (similar to the output of the pmap command)
fd – a directory containing all the file descriptors (you will see at least 0, 1 and 2, which refer to stdin, stdout and stderr respectively)
cwd – a link to the current working directory
status – process status
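For example, for the csqlserver process found above, you could inspect its open file descriptors and its status like this:

$ls -l /proc/5218/fd
$cat /proc/5218/status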

Linux Tools
Tools such as ps, pmap, top and vmstat display information about the processes running in the system, helping developers identify issues such as high CPU consumption, high disk I/O, and high memory usage by their programs.