Commit 3220e838 authored by Priyal Suneja's avatar Priyal Suneja
Browse files

added perf script

parent d2a98685
......@@ -15,7 +15,7 @@ l1_populate: l1_populate.c utils.c msr.c
gcc -O0 -Wall -o $(BUILDDIR)/l1_populate l1_populate.c utils.c msr.c -lm
l2_msr: l2_msr.c utils.c msr.c
gcc -O0 -Wall -o $(BUILDDIR)/l2_msr l2_msr.c utils.c msr.c -lm
gcc -O0 -g -Wall -o $(BUILDDIR)/l2_msr l2_msr.c utils.c msr.c -lm
l2: l2.c utils.c msr.c
gcc -O0 -Wall -o $(BUILDDIR)/l2 l2.c utils.c msr.c -lm
......@@ -29,6 +29,9 @@ ins_msr: ins_msr.c msr.c utils.c
qs_msr: qs_msr.c msr.c utils.c
gcc -O0 -Wall -o $(BUILDDIR)/qs_msr qs_msr.c utils.c msr.c -lm
mm_msr: mm_msr.c msr.c utils.c
gcc -O0 -g -Wall -o $(BUILDDIR)/mm_msr mm_msr.c utils.c msr.c -lm
ins: ins.c msr.c utils.c
gcc -O0 -Wall -o $(BUILDDIR)/ins ins.c msr.c utils.c -lm
......
......@@ -13,10 +13,16 @@ int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
long long result;
double package_before,package_after;
double dram_before,dram_after;
struct ll *head = malloc(sizeof(struct ll));
struct ll *curr = head;
int retval = populate_list(head, L1_LL_SIZE);
// struct ll *head = malloc(sizeof(struct ll));
// struct ll *curr = head;
//
// int retval = populate_list(head, L1_LL_SIZE);
int retval;
int arr[L1_LL_SIZE];
for(int i = 0; i < L1_LL_SIZE; i++) {
arr[i] = L1_LL_SIZE - i;
}
fd=open_msr(0); // todo: add package detection + map and stuff
......@@ -36,10 +42,16 @@ int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
close(fd);
for(int i = 0; i < ITERATIONS_PER_RUN; i++) {
while(curr != NULL) {
curr = curr->next;
int index = 0;
while(index < L1_SIZE) {
int j = index;
for(j; j < L1_LL_SIZE; ) {
retval += arr[j];
j += L1_SIZE;
}
index++;
}
curr = head;
}
fd = open_msr(0);
......
......@@ -5,17 +5,19 @@
*/
#include "msr.h"
int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
double *r1, double *r2) {
int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2], double *r1, double *r2) {
int dram_avail = 2;
int fd;
long long result;
double package_before,package_after;
double dram_before,dram_after;
struct ll *head = malloc(sizeof(struct ll));
struct ll *curr = head;
int retval = 0;
int retval = populate_list(head, L2_LL_SIZE);
int *arr = malloc(L2_LL_SIZE*sizeof(int));
for(int i = 0; i < L2_LL_SIZE; i++) {
arr[i] = L2_LL_SIZE - i;
}
fd=open_msr(0); // todo: add package detection + map and stuff
......@@ -35,10 +37,16 @@ int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
close(fd);
for(int i = 0; i < ITERATIONS_PER_RUN; i++) {
while(curr != NULL) {
curr = curr->next;
int index = 0;
while(index < L2_SIZE) {
int j = index;
for (j; j < L2_LL_SIZE;) {
retval += arr[j];
j += L2_SIZE;
}
index++;
}
curr = head;
}
fd = open_msr(0);
......
/*
* author: Priyal Suneja ; suneja@cs.washington.edu
*
* to run: sudo ./build/mm_msr
*/
#include "msr.h"

#include <stdlib.h>
/*
 * Integer matrix product of two square MM_SIZE x MM_SIZE operands:
 * result = one * two.
 *
 * The destination is cleared in a first pass and then accumulated into
 * in place, walking rows, then columns, then the shared inner index.
 */
void matrix_multiply(int one[MM_SIZE][MM_SIZE], int two[MM_SIZE][MM_SIZE], int result[MM_SIZE][MM_SIZE]) {
    const int dim = MM_SIZE; /* rows of `one`, columns of `two`, and the shared extent */
    int row, col, inner;

    /* Pass 1: wipe whatever the caller left in the destination. */
    for (row = 0; row < dim; ++row) {
        for (col = 0; col < dim; ++col) {
            result[row][col] = 0;
        }
    }

    /* Pass 2: classic triple loop, accumulating directly into result. */
    for (row = 0; row < dim; ++row) {
        for (col = 0; col < dim; ++col) {
            for (inner = 0; inner < dim; ++inner) {
                result[row][col] += one[row][inner] * two[inner][col];
            }
        }
    }
}
/*
 * Measure the energy reported by the package (and, when available, DRAM)
 * energy-status MSRs across ITERATIONS_PER_RUN calls to matrix_multiply().
 *
 * cpu_model    - not used by this benchmark; kept so the signature matches
 *                the other measure_msr() variants.
 * cpu_info     - availability flags; only index 2 (DRAM) is consulted here.
 * energy_units - multipliers applied to the raw counters:
 *                [0] package, [1] DRAM.
 * r1           - out: package reading after minus package reading before.
 * r2           - out: DRAM reading after minus before; written only when
 *                cpu_info[2] is non-zero.
 *
 * Returns 1 once the measurement completed, or -1 if the matrices could
 * not be allocated (in which case *r1 and *r2 are left untouched).
 */
int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
                double *r1, double *r2) {
    int dram_avail = 2;
    int fd;
    long long result;
    double package_before, package_after;
    double dram_before = 0.0, dram_after;

    (void)cpu_model;

    /*
     * Each matrix is MM_SIZE * MM_SIZE ints, which is far too large for
     * automatic storage with the configured MM_SIZE, so the operands live
     * on the heap. MM_SIZE is parenthesised because the macro expands to
     * an unparenthesised expression.
     */
    int (*one)[MM_SIZE] = malloc((size_t)(MM_SIZE) * sizeof *one);
    int (*two)[MM_SIZE] = malloc((size_t)(MM_SIZE) * sizeof *two);
    int (*res)[MM_SIZE] = malloc((size_t)(MM_SIZE) * sizeof *res);
    if (one == NULL || two == NULL || res == NULL) {
        fprintf(stderr, "measure_msr: could not allocate matrices\n");
        free(one);
        free(two);
        free(res);
        return -1;
    }

    /* Deterministic operand contents; set up before the first reading. */
    for (int i = 0; i < (MM_SIZE); i++) {
        for (int j = 0; j < (MM_SIZE); j++) {
            one[i][j] = i * j;
            two[i][j] = i + j;
        }
    }

    fd = open_msr(0); // todo: add package detection + map and stuff

    /* Package Energy */
    result = read_msr(fd, MSR_PKG_ENERGY_STATUS);
    package_before = (double)result * energy_units[0];

    /* Updated documentation (but not the Vol3B) says Haswell and */
    /* Broadwell have DRAM support too */
    if (cpu_info[dram_avail]) {
        result = read_msr(fd, MSR_DRAM_ENERGY_STATUS);
        dram_before = (double)result * energy_units[1];
    }
    close(fd);

    /* Workload under measurement. */
    for (int i = 0; i < ITERATIONS_PER_RUN; i++) {
        matrix_multiply(one, two, res);
    }

    fd = open_msr(0);
    result = read_msr(fd, MSR_PKG_ENERGY_STATUS);
    package_after = (double)result * energy_units[0];
    *r1 = package_after - package_before;

    if (cpu_info[dram_avail]) {
        result = read_msr(fd, MSR_DRAM_ENERGY_STATUS);
        dram_after = (double)result * energy_units[1];
        *r2 = dram_after - dram_before;
    }
    close(fd);

    free(one);
    free(two);
    free(res);

    return 1;
}
/*
 * Entry point for the matrix-multiply benchmark: queries the CPU
 * capabilities and energy units, takes RUNS measurements and prints the
 * average package reading.
 */
int main (int argc, char* argv[]) {
    double package_measure[RUNS];
    double dram_measure[RUNS];
    int cpu_info[3]; // 0 -> pp0, 1-> pp1, 2-> dram
    double energy_units[2]; // 0 -> cpu, 1 -> dram

    (void)argc;
    (void)argv;

    get_cpu_info(CPU_HASWELL_EP, cpu_info, energy_units);
    printf("---------------------------------------\n");

    /*
     * Fill one slot per run: print_avg() is handed RUNS entries, so every
     * one of them has to be written. RUNS is parenthesised because it is
     * a plain textual macro.
     */
    for (int run = 0; run < (RUNS); run++) {
        measure_msr(CPU_HASWELL_EP, cpu_info, energy_units,
                    &package_measure[run], &dram_measure[run]);
    }

    /* Only the package figure is reported; DRAM readings are collected but not printed. */
    print_avg(package_measure, RUNS);
    return 0;
}
......@@ -16,14 +16,15 @@
#define RUNS 1
#define ITERATIONS_PER_RUN 1000
#define L1_SIZE 5*32*1024
#define L2_SIZE 5*256*1024
#define L1_SIZE 32*1024
#define L2_SIZE 256*1024
#define PAGE_SIZE 4*1024
#define TLB_ASSOC 4
#define TLB_ENTRIES 64
#define L1_LL_SIZE 5*L1_SIZE
#define L2_LL_SIZE 1*L2_SIZE
#define L1_LL_SIZE 5*5*L1_SIZE
#define L2_LL_SIZE 5*5*L2_SIZE
#define TLB_LL_SIZE TLB_ASSOC*TLB_ENTRIES*8*10
#define MM_SIZE L1_SIZE/4
#define MSR_RAPL_POWER_UNIT 0x606
......
/*
* author: Priyal Suneja ; suneja@cs.washington.edu
*
* to run: sudo ./build/l1_msr
*/
#include "msr.h"
/*
 * Measure the energy reported by the package (and, when available, DRAM)
 * energy-status MSRs across ITERATIONS_PER_RUN traversals of a linked
 * list built by populate_list() with L1_LL_SIZE as its size argument.
 *
 * cpu_model    - not used by this benchmark.
 * cpu_info     - availability flags; only index 2 (DRAM) is consulted here.
 * energy_units - multipliers applied to the raw counters:
 *                [0] package, [1] DRAM.
 * r1           - out: package reading after minus package reading before.
 * r2           - out: DRAM reading after minus before; written only when
 *                cpu_info[2] is non-zero.
 *
 * Returns whatever populate_list() returned, or -1 if the list head could
 * not be allocated (in which case *r1 and *r2 are left untouched).
 */
int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
                double *r1, double *r2) {
    int dram_avail = 2;
    int fd;
    long long result;
    double package_before, package_after;
    double dram_before = 0.0, dram_after;

    (void)cpu_model;

    struct ll *head = malloc(sizeof *head);
    if (head == NULL) {
        /* Bail out before handing a NULL head to populate_list(). */
        fprintf(stderr, "measure_msr: could not allocate list head\n");
        return -1;
    }
    struct ll *curr = head;
    int retval = populate_list(head, L1_LL_SIZE);

    fd = open_msr(0); // todo: add package detection + map and stuff

    /* Package Energy */
    result = read_msr(fd, MSR_PKG_ENERGY_STATUS);
    package_before = (double)result * energy_units[0];

    /* Updated documentation (but not the Vol3B) says Haswell and */
    /* Broadwell have DRAM support too */
    if (cpu_info[dram_avail]) {
        result = read_msr(fd, MSR_DRAM_ENERGY_STATUS);
        dram_before = (double)result * energy_units[1];
    }
    close(fd);

    /* Workload under measurement: chase the list end to end, then rewind. */
    for (int i = 0; i < ITERATIONS_PER_RUN; i++) {
        while (curr != NULL) {
            curr = curr->next;
        }
        curr = head;
    }

    fd = open_msr(0);
    result = read_msr(fd, MSR_PKG_ENERGY_STATUS);
    package_after = (double)result * energy_units[0];
    *r1 = package_after - package_before;

    if (cpu_info[dram_avail]) {
        result = read_msr(fd, MSR_DRAM_ENERGY_STATUS);
        dram_after = (double)result * energy_units[1];
        *r2 = dram_after - dram_before;
    }
    close(fd);

    /*
     * NOTE(review): the list is never released. How populate_list()
     * allocates its nodes is not visible here, so no free is attempted.
     */
    return retval;
}
/*
 * Entry point for the L1 linked-list benchmark: queries the CPU
 * capabilities and energy units, takes RUNS measurements and prints the
 * average package reading.
 */
int main (int argc, char* argv[]) {
    double package_measure[RUNS];
    double dram_measure[RUNS];
    int cpu_info[3]; // 0 -> pp0, 1-> pp1, 2-> dram
    double energy_units[2]; // 0 -> cpu, 1 -> dram

    (void)argc;
    (void)argv;

    get_cpu_info(CPU_HASWELL_EP, cpu_info, energy_units);
    printf("---------------------------------------\n");

    /*
     * Fill one slot per run: print_avg() is handed RUNS entries, so every
     * one of them has to be written. RUNS is parenthesised because it is
     * a plain textual macro.
     */
    for (int run = 0; run < (RUNS); run++) {
        measure_msr(CPU_HASWELL_EP, cpu_info, energy_units,
                    &package_measure[run], &dram_measure[run]);
    }

    /* Only the package figure is reported; DRAM readings are collected but not printed. */
    print_avg(package_measure, RUNS);
    return 0;
}
/*
* author: Priyal Suneja ; suneja@cs.washington.edu
*
* to run: sudo ./build/l2_msr
*/
#include "msr.h"
/*
 * Measure the energy reported by the package (and, when available, DRAM)
 * energy-status MSRs across ITERATIONS_PER_RUN traversals of a linked
 * list built by populate_list() with L2_LL_SIZE as its size argument.
 *
 * cpu_model    - not used by this benchmark.
 * cpu_info     - availability flags; only index 2 (DRAM) is consulted here.
 * energy_units - multipliers applied to the raw counters:
 *                [0] package, [1] DRAM.
 * r1           - out: package reading after minus package reading before.
 * r2           - out: DRAM reading after minus before; written only when
 *                cpu_info[2] is non-zero.
 *
 * Returns whatever populate_list() returned, or -1 if the list head could
 * not be allocated (in which case *r1 and *r2 are left untouched).
 */
int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
                double *r1, double *r2) {
    int dram_avail = 2;
    int fd;
    long long result;
    double package_before, package_after;
    double dram_before = 0.0, dram_after;

    (void)cpu_model;

    struct ll *head = malloc(sizeof *head);
    if (head == NULL) {
        /* Bail out before handing a NULL head to populate_list(). */
        fprintf(stderr, "measure_msr: could not allocate list head\n");
        return -1;
    }
    struct ll *curr = head;
    int retval = populate_list(head, L2_LL_SIZE);

    fd = open_msr(0); // todo: add package detection + map and stuff

    /* Package Energy */
    result = read_msr(fd, MSR_PKG_ENERGY_STATUS);
    package_before = (double)result * energy_units[0];

    /* Updated documentation (but not the Vol3B) says Haswell and */
    /* Broadwell have DRAM support too */
    if (cpu_info[dram_avail]) {
        result = read_msr(fd, MSR_DRAM_ENERGY_STATUS);
        dram_before = (double)result * energy_units[1];
    }
    close(fd);

    /* Workload under measurement: chase the list end to end, then rewind. */
    for (int i = 0; i < ITERATIONS_PER_RUN; i++) {
        while (curr != NULL) {
            curr = curr->next;
        }
        curr = head;
    }

    fd = open_msr(0);
    result = read_msr(fd, MSR_PKG_ENERGY_STATUS);
    package_after = (double)result * energy_units[0];
    *r1 = package_after - package_before;

    if (cpu_info[dram_avail]) {
        result = read_msr(fd, MSR_DRAM_ENERGY_STATUS);
        dram_after = (double)result * energy_units[1];
        *r2 = dram_after - dram_before;
    }
    close(fd);

    /*
     * NOTE(review): the list is never released. How populate_list()
     * allocates its nodes is not visible here, so no free is attempted.
     */
    return retval;
}
/*
 * Entry point for the L2 linked-list benchmark: queries the CPU
 * capabilities and energy units, takes RUNS measurements and prints the
 * average package reading.
 */
int main (int argc, char* argv[]) {
    double package_measure[RUNS];
    double dram_measure[RUNS];
    int cpu_info[3]; // 0 -> pp0, 1-> pp1, 2-> dram
    double energy_units[2]; // 0 -> cpu, 1 -> dram

    (void)argc;
    (void)argv;

    get_cpu_info(CPU_HASWELL_EP, cpu_info, energy_units);
    printf("---------------------------------------\n");

    /*
     * Fill one slot per run: print_avg() is handed RUNS entries, so every
     * one of them has to be written. RUNS is parenthesised because it is
     * a plain textual macro.
     */
    for (int run = 0; run < (RUNS); run++) {
        measure_msr(CPU_HASWELL_EP, cpu_info, energy_units,
                    &package_measure[run], &dram_measure[run]);
    }

    /* Only the package figure is reported; DRAM readings are collected but not printed. */
    print_avg(package_measure, RUNS);
    return 0;
}
......@@ -24,7 +24,6 @@ int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
qs_arr[L2_SIZE-i] = i;
}
fd=open_msr(0); // todo: add package detection + map and stuff
/* Package Energy */
......@@ -42,7 +41,7 @@ int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
close(fd);
for(int i = 0; i < ITERATIONS_PER_RUN; i++) {
qsort(qs_arr, L2_SIZE, int(qsort), cmpfnc);
qsort(qs_arr, L2_SIZE, sizeof(int), cmpfunc);
}
fd = open_msr(0);
......@@ -59,7 +58,8 @@ int measure_msr(int cpu_model, int cpu_info[3], double energy_units[2],
}
close(fd);
return retval;
// return retval;
return 1;
}
......
......@@ -43,7 +43,7 @@ then
python3 ${a[0]}/graph_relative_error.py ${a[4]}/graph_out_one ${a[4]}/ipc_input
fi
mkdir ${a[6]}
mkdir ${a[6]} > /dev/null
mv ${a[2]} output_table *.png ${a[6]}
rm -rf ${a[4]}
********* bc *********
energy consumed: 17.105103
energy consumed: 18.268188
Performance counter stats for '/homes/sys/suneja/treehouse/single-server-etrace/benchmarks/gapbs/build/bc -g 10 -n 1':
1,280,013 L1-icache-load-misses (40.05%)
5,957,208,261 cycles (53.31%)
3,906,564,912 cycle_activity.cycles_no_execute (54.58%)
2,353,367,718 instructions # 0.40 insn per cycle (66.79%)
1,118,310 l2_rqsts.miss (68.25%)
347,481 dTLB-load-misses (69.68%)
2,119,542 L1-dcache-load-misses (70.49%)
42,203 iTLB-load-misses (55.98%)
1,567 LLC-load-misses (40.97%)
1,398,574 L1-icache-load-misses (41.35%)
6,190,266,273 cycles (53.55%)
4,098,361,581 cycle_activity.cycles_no_execute (55.93%)
2,444,308,374 instructions # 0.39 insn per cycle (67.62%)
1,231,198 l2_rqsts.miss (69.22%)
323,928 dTLB-load-misses (70.94%)
2,426,681 L1-dcache-load-misses (69.79%)
35,632 iTLB-load-misses (53.40%)
2,686 LLC-load-misses (39.36%)
0.173108112 seconds time elapsed
0.185722145 seconds time elapsed
********* bfs *********
energy consumed: 12.492126
energy consumed: 12.518921
Performance counter stats for '/homes/sys/suneja/treehouse/single-server-etrace/benchmarks/gapbs/build/bfs -g 10 -n 1':
1,108,723 L1-icache-load-misses (38.58%)
4,256,090,064 cycles (51.79%)
2,820,788,119 cycle_activity.cycles_no_execute (55.06%)
1,699,870,430 instructions # 0.40 insn per cycle (66.50%)
973,942 l2_rqsts.miss (70.49%)
225,176 dTLB-load-misses (72.08%)
1,691,443 L1-dcache-load-misses (71.50%)
34,146 iTLB-load-misses (53.37%)
889 LLC-load-misses (38.93%)
1,071,280 L1-icache-load-misses (38.65%)
4,307,324,594 cycles (50.40%)
2,868,864,012 cycle_activity.cycles_no_execute (54.08%)
1,713,426,262 instructions # 0.40 insn per cycle (66.44%)
1,043,593 l2_rqsts.miss (67.20%)
236,834 dTLB-load-misses (69.27%)
1,838,491 L1-dcache-load-misses (71.62%)
25,868 iTLB-load-misses (56.30%)
2,649 LLC-load-misses (42.88%)
0.127026523 seconds time elapsed
0.128726234 seconds time elapsed
********* cc *********
energy consumed: 11.870361
energy consumed: 11.676025
Performance counter stats for '/homes/sys/suneja/treehouse/single-server-etrace/benchmarks/gapbs/build/cc -g 10 -n 1':
1,052,401 L1-icache-load-misses (38.78%)
4,010,943,992 cycles (51.83%)
2,642,775,052 cycle_activity.cycles_no_execute (55.34%)
1,604,789,365 instructions # 0.40 insn per cycle (68.06%)
769,580 l2_rqsts.miss (69.25%)
183,421 dTLB-load-misses (70.36%)
1,569,755 L1-dcache-load-misses (71.51%)
27,965 iTLB-load-misses (54.34%)
621 LLC-load-misses (40.43%)
1,049,290 L1-icache-load-misses (38.22%)
4,018,704,957 cycles (49.84%)
2,649,330,692 cycle_activity.cycles_no_execute (51.75%)
1,597,239,477 instructions # 0.40 insn per cycle (63.41%)
912,903 l2_rqsts.miss (65.80%)
145,763 dTLB-load-misses (69.54%)
1,550,234 L1-dcache-load-misses (74.39%)
21,449 iTLB-load-misses (57.32%)
389 LLC-load-misses (42.98%)
0.121917271 seconds time elapsed
0.121783479 seconds time elapsed
********* cc_sv *********
energy consumed: 12.259216
energy consumed: 11.397766
Performance counter stats for '/homes/sys/suneja/treehouse/single-server-etrace/benchmarks/gapbs/build/cc_sv -g 10 -n 1':
1,072,745 L1-icache-load-misses (38.19%)
4,082,037,613 cycles (50.62%)
2,674,615,141 cycle_activity.cycles_no_execute (53.20%)
1,630,538,924 instructions # 0.40 insn per cycle (65.65%)
910,821 l2_rqsts.miss (69.13%)
179,548 dTLB-load-misses (72.39%)
1,526,144 L1-dcache-load-misses (71.92%)
23,777 iTLB-load-misses (55.82%)
1,334 LLC-load-misses (39.35%)
1,088,443 L1-icache-load-misses (40.39%)
3,921,869,128 cycles (53.09%)
2,581,254,413 cycle_activity.cycles_no_execute (53.35%)
1,563,640,369 instructions # 0.40 insn per cycle (66.49%)
889,677 l2_rqsts.miss (68.03%)
176,630 dTLB-load-misses (68.23%)
1,635,704 L1-dcache-load-misses (71.85%)
23,725 iTLB-load-misses (56.33%)
471 LLC-load-misses (41.82%)
0.125100855 seconds time elapsed
0.116843799 seconds time elapsed
********* graph500 *********
energy consumed: 5306.928711
energy consumed: 9446.554016
Performance counter stats for 'mpirun --allow-run-as-root --mca orte_base_help_aggregate 0 /homes/sys/suneja/treehouse/single-server-etrace/benchmarks/graph500/build/err_bfs 10':
57,505,330 L1-icache-load-misses (44.47%)
1,066,815,891,182 cycles (55.58%)
525,134,753,425 cycle_activity.cycles_no_execute (55.56%)
867,362,244,260 instructions # 0.81 insn per cycle (66.66%)
62,625,297 l2_rqsts.miss (66.64%)
3,464,834 dTLB-load-misses (66.64%)
141,176,835 L1-dcache-load-misses (66.66%)
2,595,349 iTLB-load-misses (55.56%)
663,912 LLC-load-misses (44.47%)
84,386,768 L1-icache-load-misses (44.44%)
1,903,679,492,119 cycles (55.55%)
941,000,533,585 cycle_activity.cycles_no_execute (55.55%)
1,526,777,735,404 instructions # 0.80 insn per cycle (66.66%)
82,773,037 l2_rqsts.miss (66.66%)
6,097,256 dTLB-load-misses (66.67%)
231,543,598 L1-dcache-load-misses (66.67%)
4,953,584 iTLB-load-misses (55.56%)
700,451 LLC-load-misses (44.44%)
51.242779156 seconds time elapsed
91.182001159 seconds time elapsed
********* graph500 *********
energy consumed: 2396.366150
energy consumed: 1210.031494
Performance counter stats for 'mpirun --allow-run-as-root --mca orte_base_help_aggregate 0 /homes/sys/suneja/treehouse/single-server-etrace/benchmarks/graph500/build/err_bfs 5':
27,875,712 L1-icache-load-misses (44.40%)
484,571,530,485 cycles (55.50%)
242,228,828,226 cycle_activity.cycles_no_execute (55.52%)
376,769,298,968 instructions # 0.78 insn per cycle (66.66%)
40,247,386 l2_rqsts.miss (66.67%)
2,317,000 dTLB-load-misses (66.69%)
90,245,854 L1-dcache-load-misses (66.70%)
1,513,399 iTLB-load-misses (55.59%)
576,917 LLC-load-misses (44.43%)
22,279,424 L1-icache-load-misses (44.50%)
247,684,672,026 cycles (55.61%)
126,604,996,271 cycle_activity.cycles_no_execute (55.52%)
178,710,216,603 instructions # 0.72 insn per cycle (66.60%)
29,491,315 l2_rqsts.miss (66.60%)
2,250,612 dTLB-load-misses (66.59%)
52,137,220 L1-dcache-load-misses (66.66%)
1,453,774 iTLB-load-misses (55.62%)
609,794 LLC-load-misses (44.52%)
23.504193532 seconds time elapsed
12.276787029 seconds time elapsed
********* graph500 *********
energy consumed: 3501.399292
energy consumed: 7163.584167