/*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/

#include "nuca.h"
#include "Ucache.h"
/*
 * NOTE(review): the header name of the next #include is missing from the
 * source text (it looks as if an extraction step stripped everything between
 * a '<' and the following '>'). Restore it from the upstream file before
 * building.
 */
#include

/*
 * Lower bound used when deriving the number of bank sizes to explore.
 * NOTE(review): the search routine below multiplies this global in place
 * whenever g_ip->assoc > 2, so the scaling compounds if that routine runs
 * more than once in a process.
 */
unsigned int MIN_BANKSIZE=65536;

#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */
#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */
#define CONTR_2_BANK_LAT 0

/* Contention table; the meaning of each dimension is given inline. */
int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */];

/*
 * Stores the device type pointer and runs init_cont().
 */
Nuca::Nuca(
    TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
    ):deviceType(dt)
{
  init_cont();
}

/*
 * Opens "contention.dat" for reading; prints a message and exits when the
 * file cannot be opened.
 *
 * NOTE(review): the source text is TRUNCATED inside the innermost loop header
 * below. The remainder of this function, whatever definitions followed it,
 * and the signature plus first local declarations of the next routine are
 * missing. The truncated fragment is reproduced verbatim; nothing was
 * reconstructed. Restore the missing text from the upstream file.
 */
void
Nuca::init_cont()
{
  FILE *cont;
  char line[5000];
  char jk[5000];
  cont = fopen("contention.dat", "r");
  if (!cont) {
    cout << "contention.dat file is missing!\n";
    exit(0);
  }

  for(int i=0; i<2; i++) {
    for(int j=2; j<5; j++) {
      /* NOTE(review): TRUNCATED SOURCE - text lost between "k" and "nuca_list". */
      for(int k=0; k nuca_list;

  /*
   * ------------------------------------------------------------------------
   * NOTE(review): from here on the statements operate on nuca_list, router_s
   * and g_ip and finish by calling find_optimal_nuca()/print_nuca(), so they
   * are presumably the body of the NUCA design-space search routine whose
   * header was lost in the truncation above - confirm against upstream.
   * Locals used but not declared in the visible text (it, ro, wr, i, j, k, r,
   * c, l2_c, bank_count, num_cyc, ures, opt_n, ...) were declared in the lost
   * span.
   *
   * Visible flow: build three routers (64/128/256 as first constructor
   * argument), pick the contention-table indices from g_ip, then for every
   * bank size / wire type / router combination evaluate grid shapes (rows x
   * columns), keep the lowest-latency shape, record it in nuca_list, and
   * finally select and print the best organization.
   * ------------------------------------------------------------------------
   */
  Router *router_s[ROUTER_TYPES];
  router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global));
  router_s[0]->print_router();
  router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global));
  router_s[1]->print_router();
  router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global));
  router_s[2]->print_router();

  int core_in; // to store no. of cores

  /* to search diff grid organizations */
  double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat, curr_acclat;
  double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power, avg_leakage_power;

  double opt_acclat = INF, opt_avg_lat = INF, opt_tot_lat = INF;
  int opt_rows = 0;
  int opt_columns = 0;
  double opt_totno_hops = 0;
  double opt_avg_hop = 0;
  double opt_dyn_power = 0, opt_leakage_power = 0;
  min_values_t minval;

  int bank_start = 0;

  int flit_width = 0;

  /* vertical and horizontal hop latency values */
  int ver_hop_lat, hor_hop_lat; /* in cycles */

  /* no. of different bank sizes to consider */
  int iterations;

  g_ip->nuca_cache_sz = g_ip->cache_sz;
  nuca_list.push_back(new nuca_org_t());

  /* first index into cont_stats */
  if (g_ip->cache_level == 0) l2_c = 1;
  else l2_c = 0;

  /* second index into cont_stats, chosen from the core count */
  if (g_ip->cores <= 4) core_in = 2;
  else if (g_ip->cores <= 8) core_in = 3;
  else if (g_ip->cores <= 16) core_in = 4;
  else {cout << "Number of cores should be <= 16!\n"; exit(0);}

  // set the lower bound to an appropriate value. this depends on cache associativity
  if (g_ip->assoc > 2) {
    i = 2;
    /*
     * '<' instead of the former '!=': identical for power-of-two
     * associativities, but terminates instead of spinning forever (and
     * overflowing MIN_BANKSIZE) when assoc is not a power of two.
     */
    while (i < g_ip->assoc) {
      MIN_BANKSIZE *= 2;
      i *= 2;
    }
  }

  iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE);

  if (g_ip->force_wiretype) {
    if (g_ip->wt == Low_swing) {
      wt_min = Low_swing;
      wt_max = Low_swing;
    }
    else {
      wt_min = Global;
      wt_max = Low_swing-1;
    }
  }
  else {
    wt_min = Global;
    wt_max = Low_swing;
  }

  if (g_ip->nuca_bank_count != 0) { // simulate just one bank
    if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 &&
        g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 &&
        g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) {
      /* NOTE(review): only warns; execution continues with the bad value. */
      fprintf(stderr,"Incorrect bank count value! Please fix the value in cache.cfg\n");
    }
    bank_start = (int)logtwo((double)g_ip->nuca_bank_count);
    iterations = bank_start+1;
    g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count;
  }
  cout << "Simulating various NUCA configurations\n";
  /*
   * NOTE(review): TRUNCATED SOURCE - the loop condition, the loop body up to
   * and including the start of the bank_count computation are lost between
   * "it" and "nuca_cache_sz". Fragment kept verbatim.
   */
  for (it=bank_start; itnuca_cache_sz/g_ip->cache_sz;
    cout << "====" << g_ip->cache_sz << "\n";

    for (wr=wt_min; wr<=wt_max; wr++) {

      /*
       * NOTE(review): TRUNCATED SOURCE - text lost between "ro" and
       * "flit_size". Fragment kept verbatim.
       */
      for (ro=0; roflit_size; //initialize router
        nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time;

        /* calculate router and wire parameters */

        double vlength = ures.cache_ht; /* length of the wire (u)*/
        double hlength = ures.cache_len; // u

        /* find delay, area, and power for wires */
        wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength);
        wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength);

        hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
        ver_hop_lat = calc_cycles(wire_vertical[wr]->delay,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001));

        /*
         * assume a grid like topology and explore for optimal network
         * configuration using different row and column count values.
         */
        for (c=1; c<=(unsigned int)bank_count; c++) {
          /* skip column counts that do not divide the bank count evenly */
          while (bank_count%c != 0) c++;
          r = bank_count/c;

          /*
           * to find the avg access latency of a NUCA cache, uncontended
           * access time to each bank from the
           * cache controller is calculated.
           * avg latency =
           * sum of the access latencies to individual banks)/bank
           * count value.
           */
          totno_hops = totno_hhops = totno_vhops = tot_lat = 0;
          k = 1;
          /*
           * NOTE(review): TRUNCATED SOURCE - the per-bank loops and the
           * start of the curr_acclat expression are lost between "i" and
           * "delay*avg_hop". Fragment kept verbatim.
           */
          for (i=0; idelay*avg_hop) +
            calc_cycles(ures.access_time,
                1/(nuca_list.back()->nuca_pda.cycle_time*.001));

          /* avg access lat of nuca */
          avg_dyn_power =
            avg_hop *
            (router_s[ro]->power.readOp.dynamic) + avg_hhop *
            (wire_horizontal[wr]->power.readOp.dynamic) *
            (g_ip->block_sz*8 + 64) + avg_vhop *
            (wire_vertical[wr]->power.readOp.dynamic) *
            (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic;

          /*
           * NOTE(review): the vertical term below multiplies by the
           * HORIZONTAL wire delay and, unlike the horizontal term, is not
           * scaled by flit_width. That looks like a copy/paste slip, but it
           * is left unchanged because the intended model cannot be confirmed
           * from this file.
           */
          avg_leakage_power =
            bank_count * router_s[ro]->power.readOp.leakage +
            avg_hhop * (wire_horizontal[wr]->power.readOp.leakage*
                wire_horizontal[wr]->delay) * flit_width +
            avg_vhop * (wire_vertical[wr]->power.readOp.leakage *
                wire_horizontal[wr]->delay);

          /* remember the grid shape with the lowest access latency */
          if (curr_acclat < opt_acclat) {
            opt_acclat = curr_acclat;
            opt_tot_lat = tot_lat;
            opt_avg_lat = avg_lat;
            opt_totno_hops = totno_hops;
            opt_avg_hop = avg_hop;
            opt_rows = r;
            opt_columns = c;
            opt_dyn_power = avg_dyn_power;
            opt_leakage_power = avg_leakage_power;
          }
          totno_hops = 0;
          tot_lat = 0;
          totno_hhops = 0;
          totno_vhops = 0;
        }
        nuca_list.back()->wire_pda.power.readOp.dynamic =
          opt_avg_hop * flit_width *
          (wire_horizontal[wr]->power.readOp.dynamic +
           wire_vertical[wr]->power.readOp.dynamic);
        nuca_list.back()->avg_hops = opt_avg_hop;
        /* network delay/power */
        nuca_list.back()->h_wire = wire_horizontal[wr];
        nuca_list.back()->v_wire = wire_vertical[wr];
        nuca_list.back()->router = router_s[ro];
        /* bank delay/power */

        nuca_list.back()->bank_pda.delay = ures.access_time;
        nuca_list.back()->bank_pda.power = ures.power;
        nuca_list.back()->bank_pda.area.h = ures.cache_ht;
        nuca_list.back()->bank_pda.area.w = ures.cache_len;
        nuca_list.back()->bank_pda.cycle_time = ures.cycle_time;

        num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/));
        if(num_cyc%2 != 0) num_cyc++;
        if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles
        /* keep num_cyc/2-1 from going negative if calc_cycles yields <= 0 */
        if (num_cyc < 2) num_cyc = 2;

        {
          /*
           * The banks dimension of cont_stats has 7 entries (0..6). The
           * former fallback used index 7, which reads past that dimension;
           * clamp to the last valid entry instead.
           */
          const int bank_idx = (it < 7) ? it : 6;
          const int cyc_idx = num_cyc/2-1;
          nuca_list.back()->nuca_pda.delay = opt_acclat +
            cont_stats[l2_c][core_in][ro][bank_idx][cyc_idx];
          nuca_list.back()->contention =
            cont_stats[l2_c][core_in][ro][bank_idx][cyc_idx];
        }
        nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power;
        nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power;

        /* array organization */
        nuca_list.back()->bank_count = bank_count;
        nuca_list.back()->rows = opt_rows;
        nuca_list.back()->columns = opt_columns;
        calculate_nuca_area (nuca_list.back());

        minval.update_min_values(nuca_list.back());
        /* fresh, empty entry for the next configuration */
        nuca_list.push_back(new nuca_org_t());
        opt_acclat = BIGNUM;

      }
    }
    g_ip->cache_sz /= 2;
  }

  /* drop the trailing placeholder entry that was never filled in */
  delete(nuca_list.back());
  nuca_list.pop_back();
  opt_n = find_optimal_nuca(&nuca_list, &minval);
  if (opt_n == NULL) {
    /* find_optimal_nuca returns NULL when no organization qualifies;
     * the code below would otherwise dereference it. */
    fprintf(stderr, "No NUCA organization satisfies the given constraints!\n");
    exit(1);
  }
  print_nuca(opt_n);
  g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count;

  /* element type restored: nuca_list holds the nuca_org_t pointers pushed above */
  list<nuca_org_t *>::iterator niter;
  for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter)
  {
    delete *niter;
  }
  nuca_list.clear();

  for(int i=0; i < ROUTER_TYPES; i++)
  {
    delete router_s[i];
  }
  g_ip->display_ip();
  // g_ip->force_cache_config = true;
  // g_ip->ndwl = 8;
  // g_ip->ndbl = 16;
  // g_ip->nspd = 4;
  // g_ip->ndcm = 1;
  // g_ip->ndsam1 = 8;
  // g_ip->ndsam2 = 32;

}

/*
 * Prints a report for one NUCA organization: bank count, grid shape, network
 * frequency, cache dimensions, router details, and wire type / delay / energy.
 *
 * fr - organization to report; must be non-NULL with valid router, h_wire and
 *      v_wire pointers (all are dereferenced).
 */
void
Nuca::print_nuca (nuca_org_t *fr)
{
  printf("\n---------- CACTI version 6.5, Non-uniform Cache Access "
      "----------\n\n");
  printf("Optimal number of banks - %d\n", fr->bank_count);
  printf("Grid organization rows x columns - %d x %d\n",
      fr->rows, fr->columns);
  printf("Network frequency - %g GHz\n",
      (1/fr->nuca_pda.cycle_time)*1e3);
  printf("Cache dimension (mm x mm) - %g x %g\n",
      fr->nuca_pda.area.h*1e-3,
      fr->nuca_pda.area.w*1e-3);

  fr->router->print_router();

  printf("\n\nWire stats:\n");
  if (fr->h_wire->wt == Global) {
    printf("\tWire type - Full swing global wires with least "
        "possible delay\n");
  }
  else if (fr->h_wire->wt == Global_5) {
    printf("\tWire type - Full swing global wires with "
        "5%% delay penalty\n");
  }
  else if (fr->h_wire->wt == Global_10) {
    printf("\tWire type - Full swing global wires with "
        "10%% delay penalty\n");
  }
  else if (fr->h_wire->wt == Global_20) {
    printf("\tWire type - Full swing global wires with "
        "20%% delay penalty\n");
  }
  else if (fr->h_wire->wt == Global_30) {
    printf("\tWire type - Full swing global wires with "
        "30%% delay penalty\n");
  }
  else if(fr->h_wire->wt == Low_swing) {
    printf("\tWire type - Low swing wires\n");
  }

  printf("\tHorizontal link delay - %g (ns)\n",
      fr->h_wire->delay*1e9);
  printf("\tVertical link delay - %g (ns)\n",
      fr->v_wire->delay*1e9);
  /*
   * Dimensions are converted to mm with *1e-3 in the "Cache dimension" line
   * above, so the bank width needs the same factor for this figure to match
   * its "(ns/mm)" label; previously the raw width was used as the divisor.
   */
  printf("\tDelay/length - %g (ns/mm)\n",
      fr->h_wire->delay*1e9/(fr->bank_pda.area.w*1e-3));
  printf("\tHorizontal link energy -dynamic/access %g (nJ)\n"
      "\t -leakage %g (nW)\n\n",
      fr->h_wire->power.readOp.dynamic*1e9,
      fr->h_wire->power.readOp.leakage*1e9);
  printf("\tVertical link energy -dynamic/access %g (nJ)\n"
      "\t -leakage %g (nW)\n\n",
      fr->v_wire->power.readOp.dynamic*1e9,
      fr->v_wire->power.readOp.leakage*1e9);
  printf("\n\n");
  fr->v_wire->print_wire();
  printf("\n\nBank stats:\n");
}


/*
 * Picks the lowest-cost organization from *n.
 *
 * Cost depends on g_ip->ed:
 *   ed == 1 : (delay/min_delay) * (dynamic/min_dyn)
 *   ed == 2 : (delay/min_delay)^2 * (dynamic/min_dyn)
 *   other   : weighted sum of normalized delay, cycle time, dynamic power,
 *             leakage power and area, using the *_wt_nuca weights from g_ip.
 *             Organizations rejected by check_nuca_org() are erased from *n.
 *
 * n      - candidate list; modified in the weighted mode (entries erased).
 * minval - per-metric minima used for normalization; min_leakage is replaced
 *          by 0.1 when it is 0 (weighted mode only).
 *
 * Returns the best organization, or NULL when the list is empty or every
 * entry was rejected.
 *
 * NOTE(review): erased entries are only unlinked, not deleted. If the caller
 * owns the pointed-to objects through the list they are leaked - confirm
 * ownership before adding a delete here.
 */
nuca_org_t *
Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval)
{
  double cost = 0;
  double min_cost = BIGNUM;
  nuca_org_t *res = NULL;
  float d, a, dp, lp, c;

  dp = g_ip->dynamic_power_wt_nuca;
  lp = g_ip->leakage_power_wt_nuca;
  a = g_ip->area_wt_nuca;
  d = g_ip->delay_wt_nuca;
  c = g_ip->cycle_time_wt_nuca;

  /*
   * The iterator is advanced explicitly. The former
   * "erase, step back unless at begin(), then ++" sequence skipped the element
   * following an erased first element and incremented end() when the erase
   * emptied the list.
   */
  list<nuca_org_t *>::iterator niter = n->begin();
  while (niter != n->end()) {
    nuca_org_t *org = *niter;

    fprintf(stderr, "\n-----------------------------"
        "---------------\n");

    printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t "
        "bank_dpower = %g \tleak = %g \tcycle = %g\n",
        org->bank_count,
        org->nuca_pda.delay,
        org->nuca_pda.power.readOp.dynamic,
        org->h_wire->wt,
        org->bank_pda.power.readOp.dynamic,
        org->nuca_pda.power.readOp.leakage,
        org->nuca_pda.cycle_time);

    if (g_ip->ed == 1) {
      cost = (org->nuca_pda.delay/minval->min_delay)*
        (org->nuca_pda.power.readOp.dynamic/minval->min_dyn);
    }
    else if (g_ip->ed == 2) {
      cost = (org->nuca_pda.delay/minval->min_delay)*
        (org->nuca_pda.delay/minval->min_delay)*
        (org->nuca_pda.power.readOp.dynamic/minval->min_dyn);
    }
    else {
      /*
       * Substitute a non-zero minimum BEFORE the constraint check, which
       * divides by min_leakage; previously the first candidate was checked
       * against a zero divisor.
       */
      if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling

      /*
       * check whether the current organization
       * meets the input deviation constraints
       */
      if (!check_nuca_org(org, minval)) {
        niter = n->erase(niter); /* already points at the next candidate */
        continue;
      }

      cost = (d * (org->nuca_pda.delay/minval->min_delay) +
          c * (org->nuca_pda.cycle_time/minval->min_cyc) +
          dp * (org->nuca_pda.power.readOp.dynamic/minval->min_dyn) +
          lp * (org->nuca_pda.power.readOp.leakage/minval->min_leakage) +
          a * (org->nuca_pda.area.get_area()/minval->min_area));
      fprintf(stderr, "cost = %g\n", cost);
    }

    if (min_cost > cost) {
      min_cost = cost;
      res = org;
    }
    ++niter;
  }
  return res;
}

/*
 * Tests one organization against the allowed percentage deviations from the
 * per-metric minima (delay, dynamic power, leakage power, cycle time, area;
 * limits come from the g_ip->*_dev_nuca fields).
 *
 * Returns 1 when every metric is within its limit, 0 otherwise.
 *
 * NOTE(review): each test divides by the corresponding minimum; a zero
 * minimum is not guarded against here.
 */
int
Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval)
{
  if (((n->nuca_pda.delay - minval->min_delay)*100/minval->min_delay) >
      g_ip->delay_dev_nuca) {
    return 0;
  }
  if (((n->nuca_pda.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
      g_ip->dynamic_power_dev_nuca) {
    return 0;
  }
  if (((n->nuca_pda.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
      g_ip->leakage_power_dev_nuca) {
    return 0;
  }
  if (((n->nuca_pda.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
      g_ip->cycle_time_dev_nuca) {
    return 0;
  }
  if (((n->nuca_pda.area.get_area() - minval->min_area)/minval->min_area)*100 >
      g_ip->area_dev_nuca) {
    return 0;
  }
  return 1;
}

/*
 * Fills in nuca->nuca_pda.area from the grid shape.
 *
 * height = rows    * (h_wire pitch * router flit size + bank height)
 * width  = columns * (v_wire pitch * router flit size + bank width)
 * where pitch = wire_width + wire_spacing of the respective wire.
 */
void
Nuca::calculate_nuca_area (nuca_org_t *nuca)
{
  nuca->nuca_pda.area.h=
    nuca->rows * ((nuca->h_wire->wire_width +
          nuca->h_wire->wire_spacing)
        * nuca->router->flit_size + nuca->bank_pda.area.h);

  nuca->nuca_pda.area.w =
    nuca->columns * ((nuca->v_wire->wire_width +
          nuca->v_wire->wire_spacing)
        * nuca->router->flit_size + nuca->bank_pda.area.w);
}