#include #include "lib.hpp" #include #include #include "lut.hpp" #include "arnoldi.hpp" using std::isnan; // returns v __ulib_inline f32 calc_waveform_grad( f32 t, f32 dt, const f32 poles[Q], const f32 residues[Q], f32 &g_t, f32 &g_dt ) { if(t < 0.f) { g_t = g_dt = 0.f; return 0.f; } f32 t1 = fmaxf(0.f, t - dt); f32 v = t - t1; for(u32 i = 0; i < Q; ++i) { v -= residues[i] * (expf(t * poles[i]) - expf(t1 * poles[i])) / poles[i]; } v /= dt; g_dt = -v / dt; g_t = 1.f; for(u32 i = 0; i < Q; ++i) { g_t -= residues[i] * expf(t * poles[i]); } g_t /= dt; if(t >= dt) { f32 g_left = 1.f; for(u32 i = 0; i < Q; ++i) { g_left -= residues[i] * expf((t - dt) * poles[i]); } g_left /= dt; g_t -= g_left; g_dt += g_left; } return v; } // 3x3 matrix determinant // to be used in cramer's rule linear solving __ulib_inline f32 det_3x3( f32 a00, f32 a01, f32 a02, f32 a10, f32 a11, f32 a12, f32 a20, f32 a21, f32 a22 ) { return a00 * (a11 * a22 - a12 * a21) - a01 * (a10 * a22 - a12 * a20) + a02 * (a10 * a21 - a11 * a20); } __ulib_inline void update_merge( f32 &old, f32 nw, MergingStrategy merging_strategy ) { // supports old==NAN where we always update. if((merging_strategy == MergingStrategy_MIN && !(old < nw)) || (merging_strategy == MergingStrategy_MAX && !(old > nw))) old = nw; } __ulib_inline f32 calc_single_timepoint_nr( f32 dt, const f32 poles[Q], const f32 residues[Q], f32 v, f32 init_t ) { f32 t = init_t; for(u32 i = 0; i < 20; ++i) { f32 f, g_t, _g_dt; f = calc_waveform_grad(t, dt, poles, residues, g_t, _g_dt) - v; if(fabsf(f) < 1e-3) { // converged timepoint break; } t -= f / g_t; } return t; } extern "C" void sta_dmp_prop_cpu( // level usize level_l, usize level_r, const usize *levels_nd, // graph const STAGraphArc *arcs, const usize *arcs_st, // pin to net index const usize *pin2net, // library pin, timing arcs, and luts const LibPinInfo *pininfo, const f32 *lib_tables, const TimingArcPointer *lib_timing_arcs, // metadata: merging strategy and rf // we only deal with one rf in one kernel. MergingStrategy merging_strategy, u8 trf, // ct-arnoldi roms, dedicated to that single rf const f32 *driver_rd, const f32 *total_caps_for_nets, const f32 (*poles)[Q], const f32 (*residues)[Q], // temporary space for ceff, t0, dt, arc_slew storage // // size = #arcs * 16 // 16 = frf(2) * trf(2) * {dt, t0}(2) f32 (*cellarcs_tmps)[8], // outputs f32 (*arcs_delay)[4], f32 (*slews)[2] ) { for(usize level_i = level_l; level_i < level_r; ++level_i) { usize cur = levels_nd[level_i]; usize curnet = pin2net[cur]; f32 *cpole = (f32 *)poles[curnet], *cresidue = (f32 *)residues[cur]; f32 Rd = driver_rd[curnet]; usize arci = arcs_st[cur], arce = arcs_st[cur + 1]; if(arci != arce) { // reset slew if nontrivial fanin slews[cur][trf] = f32_NAN; } f32 th_delay = pininfo[cur].delay_threshold[trf]; f32 th_slew1 = pininfo[cur].slew_threshold[trf][0], th_slew2 = pininfo[cur].slew_threshold[trf][1]; for(; arci < arce; ++arci) { usize prev = arcs[arci].prev; usize arcid = arcs[arci].arcid; arcs_delay[arci][trf] = arcs_delay[arci][2 | trf] = f32_NAN; if(arcid != usize_MAX) { // ###### CELL ARC f32 total_caps = total_caps_for_nets[curnet]; for(u8 frf = 0; frf < 2; ++frf) { f32 fslew = slews[prev][frf]; if(isnan(fslew)) continue; LUTReference ld = lib_timing_arcs[arcid].delay[frf][trf], ls = lib_timing_arcs[arcid].slew[frf][trf]; if(ld.data_start == usize_MAX || ls.data_start == usize_MAX) { continue; } f32 init_delay = lut_query( lib_tables, ld, total_caps, fslew ); f32 init_slew = lut_query( lib_tables, ls, total_caps, fslew ); // if trivial downstream net? if(isnan(Rd)) { arcs_delay[arci][frf << 1 | trf] = init_delay; update_merge(slews[cur][trf], init_slew, merging_strategy); continue; } // =================== test_Ceff_smallNR // NR initials f32 Ceff = total_caps; f32 dt = init_slew / (th_slew2 - th_slew1); f32 t0 = init_delay - 0.69f * Rd * Ceff - dt / 2.f; f32 tr1 = init_delay - init_slew * (1.f - th_slew1); // Ceff iter f32 delay_old = f32_NAN, slew_old = f32_NAN, delay, slew; for(u32 ceff_i = 0; ceff_i < 20; ++ceff_i) { delay = lut_query(lib_tables, ld, Ceff, fslew); slew = lut_query(lib_tables, ls, Ceff, fslew); if(fabsf(delay - delay_old) / delay_old < 1e-3 && fabsf(slew - slew_old) / slew_old < 1e-3) { // CONVERGED (1‰). STOP. break; } // NR iter for(u32 nr_i = 0; nr_i < 20; ++nr_i) { f32 fv0, fv1, fv2, gwf1t, gwf1dt, gwf2t, gwf2dt, gwf3t, gwf3dt; fv0 = calc_waveform_grad( delay - t0, dt, cpole, cresidue, gwf1t, gwf1dt) - th_delay; fv1 = calc_waveform_grad( tr1 - t0, dt, cpole, cresidue, gwf2t, gwf2dt) - th_slew1; fv2 = calc_waveform_grad( tr1 + slew - t0, dt, cpole, cresidue, gwf3t, gwf3dt) - th_slew2; if(fabsf(fv0) < 1e-3 && fabsf(fv1) < 1e-3 && fabsf(fv2) < 1e-3) { // smallNR EARLY STOP. break; } f32 detj = det_3x3( gwf1dt, -gwf1t, 0.f, gwf2dt, -gwf2t, gwf2t, gwf3dt, -gwf3t, gwf3t ); if(fabsf(detj) < 1e-8) { printf("[WARN] NR singular\n"); break; } f32 d0 = det_3x3( fv0, -gwf1t, 0.f, fv1, -gwf2t, gwf2t, fv2, -gwf3t, gwf3t ); f32 d1 = det_3x3( gwf1dt, fv0, 0.f, gwf2dt, fv1, gwf2t, gwf3dt, fv2, gwf3t ); f32 d2 = det_3x3( gwf1dt, -gwf1t, fv0, gwf2dt, -gwf2t, fv1, gwf3dt, -gwf3t, fv2 ); dt -= d0 / detj; t0 -= d1 / detj; tr1 -= d2 / detj; if(dt <= 0.f) { printf("[WARN] NR too small dt, bad Rd\n"); break; } } f32 Qpidt = 0.f; for(u32 i = 0; i < Q; ++i) { Qpidt += cresidue[i] * ( -1.f + expf(dt * cpole[i]) - dt * cpole[i] ) / (dt * cpole[i] * cpole[i] * Rd); } f32 QCeff_quad_coeff = Rd * ( -1.f + expf(-dt / (Ceff * Rd)) ) / dt; f32 Ceff_new = ( -1.f + sqrtf(1.f + 4.f * QCeff_quad_coeff * Qpidt)) / (2.f * QCeff_quad_coeff); if(Ceff_new > Ceff) break; Ceff = Ceff_new; delay_old = delay; slew_old = slew; } f32 *cellarcs_tmp = (f32 *)(cellarcs_tmps + arci) + (frf << 1 | trf) * 2; cellarcs_tmp[0] = dt; cellarcs_tmp[1] = t0; arcs_delay[arci][frf << 1 | trf] = delay; update_merge(slews[cur][trf], slew, merging_strategy); // =================== end test_Ceff_smallNR } } else { // ###### NET ARC // if trivial downstream net? if(isnan(Rd)) { arcs_delay[arci][trf << 1 | trf] = 0.; slews[cur][trf] = slews[prev][trf]; continue; } f32 slew = slews[prev][trf]; if(isnan(slew)) continue; // calculate net delay and slew, by merging all driving // cell arcs usize farci = arcs_st[prev], farce = arcs_st[prev + 1]; if(farci == farce) { // no driver cell arc => PI // the Rd have been set to a very small value. f32 dt = slew / ( pininfo[prev].slew_threshold[trf][1] - pininfo[prev].slew_threshold[trf][0]); f32 t0 = dt / 2.; arcs_delay[arci][trf << 1 | trf] = calc_single_timepoint_nr( dt, cpole, cresidue, th_delay, dt) - t0; slews[cur][trf] = calc_single_timepoint_nr( dt, cpole, cresidue, th_slew2, dt) - calc_single_timepoint_nr( dt, cpole, cresidue, th_slew1, dt); continue; } for(; farci < farce; ++farci) { for(u32 frf = 0; frf < 2; ++frf) { f32 delay = arcs_delay[farci][frf << 1 | trf]; if(isnan(delay)) continue; f32 dt = cellarcs_tmps[farci][(frf << 1 | trf) * 2]; f32 t0 = cellarcs_tmps[farci][(frf << 1 | trf) * 2 + 1]; update_merge( arcs_delay[arci][trf << 1 | trf], calc_single_timepoint_nr( dt, cpole, cresidue, th_delay, delay - t0) + t0 - delay, merging_strategy); update_merge( slews[cur][trf], calc_single_timepoint_nr( dt, cpole, cresidue, th_slew2, delay - t0) - calc_single_timepoint_nr( dt, cpole, cresidue, th_slew1, delay - t0), merging_strategy); } } } } } }