The sample code, test_a.c, is shown below:
#include <stdio.h>
#include <stdint.h>
#include <string.h>
/*
 * func3 - innermost workload of the profiling sample.
 *
 * Copies one 100-byte stack buffer into another 0xFF (255) times. The
 * copied data is never inspected; the function exists only to spend CPU
 * time so that it stands out when the program is profiled.
 *
 * The source buffer is zero-initialized so the copy never reads
 * indeterminate bytes, and the buffers are named after their role and
 * passed in memcpy's (destination, source, size) order.
 */
void func3(void)
{
   int count = 0;
   char src[100] = {0};
   char dst[100];

   for (count = 0; count < 0XFF; count++)
   {
       memcpy(dst, src, sizeof(dst));
   }
   return;
}
/*
 * func2 - middle layer of the profiling sample.
 *
 * Runs 0xFF (255) iterations; each iteration multiplies a running
 * product by (count + 1) and then calls func3() once.
 *
 * The product is kept in an unsigned 64-bit integer: the value grows
 * like a factorial and exceeds 64 bits after about twenty steps.
 * Unsigned arithmetic wraps modulo 2^64, whereas overflowing a signed
 * integer is undefined behavior. The product is never consumed; it is
 * only extra arithmetic for the profiler to sample.
 */
void func2(void)
{
   int count = 0;
   uint64_t s = 1;

   for (count = 0; count < 0XFF; count++)
   {
       s = s * (uint64_t)(count + 1);
       func3();
   }
   (void)s; /* result intentionally unused */
   return;
}
/*
 * func4 - second workload entry point, called directly from main().
 *
 * Runs 0xFF (255) iterations; each iteration multiplies a running
 * product by (count + 1) and then calls func3() once.
 *
 * The product is kept in an unsigned 64-bit integer: the value grows
 * like a factorial and exceeds 64 bits after about twenty steps.
 * Unsigned arithmetic wraps modulo 2^64, whereas overflowing a signed
 * integer is undefined behavior. The product is never consumed; it is
 * only extra arithmetic for the profiler to sample.
 */
void func4(void)
{
   int count = 0;
   uint64_t s = 1;

   for (count = 0; count < 0XFF; count++)
   {
       s = s * (uint64_t)(count + 1);
       func3();
   }
   (void)s; /* result intentionally unused */
   return;
}
/*
 * func1 - outermost driver of the profiling sample.
 *
 * Calls func2() 0xFFFF (65535) times in a row. The large repeat count
 * keeps the process running for a long time.
 */
void func1(void)
{
   int remaining = 0XFFFF;

   while (remaining-- > 0)
   {
       func2();
   }
}
/*
 * main - entry point of the profiling sample.
 *
 * Prints a banner, runs the func1() workload, prints a second banner,
 * then runs func4() once. Always returns 0.
 */
int main(void)
{
    /* Phase 1: the long-running nested workload. */
    fputs("\n Hello World! \n", stdout);
    func1();

    /* Phase 2: a single pass through func4(). */
    fputs("\n step 2! \n", stdout);
    func4();

    return 0;
}
#compile with:
gcc -Wall  test_a.c -g -o test_a
#install perf on ubuntu 14:
 sudo apt-get install linux-tools-common linux-tools-generic linux-tools-`uname -r`
#run test_a (leave it running and use a second terminal for the commands below):
./test_a
#find the pid of test_a (21033 in this example) with:
ps aux|grep test_a
sudo perf record -p 21033
#press Ctrl+C to stop recording
sudo perf report
It will show the profiling result, as below:
Now you know the bottleneck: func3()
You can also watch real-time CPU usage with:
sudo perf top
Another option is -g, which records call graphs:
perf record -g -p pid
perf report -g 'graph,0.5,caller'
perf report --max-stack=6 --stdio -s parent
 
ref: http://rhaas.blogspot.co.uk/2012/06/perf-good-bad-ugly.html