#include #include "lib.hpp" #include #include #include "arnoldi.hpp" #include "lut.hpp" using std::isnan; // template // __ulib_inline void debug_sparse_matrix(u32 n, const nnzint nnz[], const u32 p[], const f32 v[]) { // nnzint nnzstart = nnz[0]; // for(u32 i = 0; i < n; ++i) { // printf("{%d}: ", i); // for(u32 j = nnz[i]; j < nnz[i + 1]; ++j) { // printf("[%d]=%f ", p[j - nnzstart], v[j - nnzstart]); // } // putchar('\n'); // } // } /// solve linear equation (in-place) given the LDL decomposition. /// L is in CSC format w/o the diagonal 1s. __ulib_inline void lu_solve_(u32 n, f32 x[], const usize Lnnz[], const u32 Lp[], const f32 Lv[], const f32 D[]) { // lower diagonal solve (L) usize Lnnz0 = Lnnz[0]; for(u32 i = 0; i < n; ++i) { f32 xi = x[i]; for(u32 jp = Lnnz[i], jpe = Lnnz[i + 1]; jp < jpe; ++jp) { x[Lp[jp - Lnnz0]] -= Lv[jp - Lnnz0] * xi; } } // diagonal solve (D) for(u32 i = 0; i < n; ++i) x[i] /= D[i]; // upper diagonal solve (L^T) for(u32 i = n - 1; i != u32_MAX; --i) { f32 xi = x[i]; for(u32 jp = Lnnz[i], jpe = Lnnz[i + 1]; jp < jpe; ++jp) { xi -= Lv[jp - Lnnz0] * x[Lp[jp - Lnnz0]]; } x[i] = xi; } } /// matrix-vector multiply. /// assumes y[] initialized as zero. /// given matrix is in CSC format. __ulib_inline void mvmul(u32 n, const f32 x[], const u32 nnz[], const u32 p[], const f32 v[], f32 y[]) { for(u32 i = 0; i < n; ++i) { f32 xi = x[i]; for(u32 jp = nnz[i], jpe = nnz[i + 1]; jp < jpe; ++jp) { y[p[jp]] += v[jp] * xi; } } } __ulib_inline f32 dot(u32 n, const f32 a[], const f32 b[]) { f32 r = 0.; for(u32 k = 0; k < n; ++k) r += a[k] * b[k]; return r; } /// compute QR decomposition of Aq and store them in Qq and Rq. /// this is non-traditional because the result is A=R*Q, not A=Q*R, /// and R is lower-triangular, not upper-triangular. /// assumes Aq is nonsingular. /// If Aq is singular, it should be offsetted (shifted) by +lambda*I. __ulib_inline void QRdecQ(const f32 Aq[Q][Q], f32 Qq[Q][Q], f32 Rq[Q][Q]) { // init for(u32 i = 0; i < Q; ++i) { for(u32 j = 0; j < Q; ++j) { Qq[i][j] = Aq[i][j]; Rq[i][j] = 0.; } } // Gram-Schmidt for(u32 i = 0; i < Q; ++i) { for(u32 j = 0; j < i; ++j) { f32 r = 0.; for(u32 k = 0; k < Q; ++k) r += Qq[j][k] * Aq[i][k]; Rq[i][j] = r; for(u32 k = 0; k < Q; ++k) Qq[i][k] -= r * Qq[j][k]; } // normalize current f32 norm = 0.; for(u32 k = 0; k < Q; ++k) norm += Qq[i][k] * Qq[i][k]; norm = sqrt(norm); assert(fabsf(norm) > 1e-8); // reject singularity Rq[i][i] = norm; for(u32 k = 0; k < Q; ++k) Qq[i][k] /= norm; } } /// C = A @ B /// cannot be used in-place. __ulib_inline void matmulQ(const f32 A[Q][Q], const f32 B[Q][Q], f32 C[Q][Q]) { for(u32 i = 0; i < Q; ++i) for(u32 j = 0; j < Q; ++j) { f32 r = 0.; for(u32 k = 0; k < Q; ++k) r += A[i][k] * B[k][j]; C[i][j] = r; } } /// estimate the driving resistance (Rd) for every net, given /// the (with initial) slew array and the fanin cell arcs. extern "C" void sta_arom_estimate_rd_cpu( usize num_nets, // parasitic elements const usize *num_elems, const ParasiticElement *elem_arr, // net pins const usize *net_pin_start, const usize *net_pins, // pin info const LibPinInfo *pin_infos, const f32 (*slews)[2], // sta graph info const STAGraphArc *arcs, const usize *arcs_st, const f32 *lib_tables, const TimingArcPointer *lib_timing_arcs, // use rise(0) or fall(1) slew & caps. 
/// estimate the driving resistance (Rd) for every net, given
/// the slew array (with initial values) and the fanin cell arcs.
extern "C" void sta_arom_estimate_rd_cpu(
    usize num_nets,
    // parasitic elements
    const usize *num_elems, const ParasiticElement *elem_arr,
    // net pins
    const usize *net_pin_start, const usize *net_pins,
    // pin info
    const LibPinInfo *pin_infos,
    const f32 (*slews)[2],
    // sta graph info
    const STAGraphArc *arcs, const usize *arcs_st,
    const f32 *lib_tables, const TimingArcPointer *lib_timing_arcs,
    // use rise(0) or fall(1) slew & caps.
    u8 rf,
    // result: driver rd and total caps
    f32 *driver_rds, f32 *total_caps_for_nets
) {
    for(usize net_id = 0; net_id < num_nets; ++net_id) {
        usize prev_elems = num_elems[net_id];
        u32 n_elem = (u32)(num_elems[net_id + 1] - prev_elems);
        const ParasiticElement *elem = elem_arr + prev_elems;
        f32 total_caps = 0., min_res = 1e10;
        for(u32 i = 0; i < n_elem; ++i) {
            if(elem[i].typ == ParasiticElementType_C && elem[i].b == u32_MAX) {
                total_caps += elem[i].value;
            }
            else if(elem[i].typ == ParasiticElementType_R) {
                min_res = fminf(min_res, elem[i].value);
            }
        }
        for(usize pini = net_pin_start[net_id], pine = net_pin_start[net_id + 1];
            pini < pine; ++pini) {
            total_caps += pin_infos[net_pins[pini]].caps[rf];
        }
        total_caps_for_nets[net_id] = total_caps;
        if(n_elem == 0) {
            // skip nets without parasitics
            driver_rds[net_id] = f32_NAN;
            continue;
        }
        usize driver_pin = net_pins[net_pin_start[net_id]];
        f32 slew = slews[driver_pin][rf];
        usize arci = arcs_st[driver_pin], arce = arcs_st[driver_pin + 1];
        if(arci == arce) {
            // no driver cell arc => PI.
            // set a very small Rd.
            driver_rds[net_id] = min_res / 1e3;
            continue;
        }
        if(isnan(slew)) {
            // get an estimated slew from previous cell arcs
            for(; arci < arce; ++arci) {
                usize arcid = arcs[arci].arcid;
                assert(arcid != usize_MAX);
                for(u32 frf = 0; frf < 2; ++frf) {
                    LUTReference lutref = lib_timing_arcs[arcid].slew[frf][rf];
                    if(lutref.data_start == usize_MAX) continue;
                    // query the table using total_caps and the minimum slew index.
                    f32 slewval = lut_query(
                        lib_tables, lutref, total_caps,
                        lib_tables[lutref.data_start + lutref.axis1]);
                    // take the minimum possible slew.
                    // do not flip this comparison: written this way, it also
                    // replaces a NaN in the original slew.
                    if(!(slewval > slew)) slew = slewval;
                }
            }
            if(isnan(slew)) slew = 1.; // give up and feed a dummy value.
        }
        LibPinInfo pininfo = pin_infos[driver_pin];
        // printf("driver_rds[%lu] calculated by slew %f total_caps %f threshold %f %f\n",
        //        net_id, slew, total_caps, pininfo.slew_threshold[rf][1], pininfo.slew_threshold[rf][0]);
        driver_rds[net_id] = slew / (total_caps * log(
            pininfo.slew_threshold[rf][1] / pininfo.slew_threshold[rf][0])) / 4.;
    }
}
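// Sketch of where the Rd formula above comes from (an assumption-laden
// reading of the code, not taken from an external spec): model the driver
// as a resistance Rd discharging the lumped net capacitance C, so the
// output follows V(t) = V0 * exp(-t / (Rd * C)). The time spent between
// the two slew measurement levels th1 > th0 is then
//     slew = Rd * C * ln(th1 / th0),
// which solved for Rd gives Rd = slew / (C * ln(th1 / th0)), matching the
// expression above with th1/th0 = slew_threshold[rf][1] / slew_threshold[rf][0].
// The trailing /4 reads as an empirical derating constant.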
/// construct modified nodal analysis systems (matrices G and C)
/// for every net, given a single rise/fall condition.
/// to cover both rise and fall, run the kernel series twice with
/// different memory spaces.
///
/// let:
///   num_nets = #nets
///   n = #rc network nodes for a net
///   nelem = #rc elements for a net
///   nmat = nelem * 2 + n = the upper bound of matrix nonzeros
///
/// prep_ints contains the following arrays:
///   radixcnt[n + 1], radixorder1[nelem], radixorder2[nelem], Gfed[n], Cfed[n]
/// prep_floats contains the following arrays:
///   diagG[n], diagC[n]
///
/// mna_ints contains:
///   Gnnz[n + 1], Gp[nmat], Cnnz[n + 1], Cp[nmat]
/// mna_floats contains:
///   Gv[nmat], Cv[nmat]
extern "C" void sta_arom_construct_mna_cpu(
    usize num_nets,
    const usize *num_nodes, const usize *num_elems,
    const ParasiticElement *elem_arr,
    // net pins
    const usize *net_pin_start, const usize *net_pins,
    // pin info, for introducing additional caps in MNA.
    const LibPinInfo *pin_infos,
    u8 rf,
    const f32 *driver_rds,
    u32 *prep_ints, f32 *prep_floats,
    u32 *mna_ints, f32 *mna_floats
) {
    for(usize net_id = 0; net_id < num_nets; ++net_id) {
        usize prev_nd = num_nodes[net_id];
        u32 n = (u32)(num_nodes[net_id + 1] - prev_nd);
        usize prev_elems = num_elems[net_id];
        u32 n_elem = (u32)(num_elems[net_id + 1] - prev_elems);
        u32 max_nmat = n_elem * 2 + n;
        if(n_elem == 0) continue; // skip nets without parasitics
        const ParasiticElement *elem = elem_arr + prev_elems;
        u32 *radixcnt = prep_ints + (net_id + prev_nd * 3 + prev_elems * 2);
        u32 *radixorder1 = radixcnt + n + 1;
        u32 *radixorder2 = radixorder1 + n_elem;
        u32 *Gfed = radixorder2 + n_elem, *Cfed = Gfed + n;
        f32 *diagG = prep_floats + prev_nd * 2, *diagC = diagG + n;
        u32 *Gnnz = mna_ints + (net_id * 2 + prev_nd * 4 + prev_elems * 4);
        u32 *Gp = Gnnz + n + 1;
        u32 *Cnnz = Gp + max_nmat;
        u32 *Cp = Cnnz + n + 1;
        f32 *Gv = mna_floats + (prev_nd * 2 + prev_elems * 4);
        f32 *Cv = Gv + max_nmat;
        // build matrix: preprocess -> csc.
        // calculate nnz for G and C.
        for(u32 i = 1; i <= n; ++i) Gnnz[i] = Cnnz[i] = 1; // diag
        for(u32 i = 0; i < n_elem; ++i) {
            if(elem[i].typ == ParasiticElementType_R) {
                ++Gnnz[elem[i].a + 1];
                ++Gnnz[elem[i].b + 1];
            }
            else {
                if(elem[i].b == u32_MAX) continue;
                ++Cnnz[elem[i].a + 1];
                ++Cnnz[elem[i].b + 1];
            }
        }
        for(u32 i = 1; i <= n; ++i) {
            Gnnz[i] += Gnnz[i - 1];
            Cnnz[i] += Cnnz[i - 1];
        }
        // sort by: (b + 1, a + 1). (we already guaranteed a < b.)
        // +1 is for u32_MAX to become zero.
        // radix sort round 1: sort by a.
        for(u32 i = 0; i < n_elem; ++i) {
            u32 k = elem[i].a + 1;
            ++radixcnt[k];
        }
        for(u32 i = 1; i <= n; ++i) radixcnt[i] += radixcnt[i - 1];
        for(u32 i = n_elem - 1; i != u32_MAX; --i) {
            u32 k = elem[i].a + 1;
            radixorder1[--radixcnt[k]] = i;
        }
        // radix sort round 2: sort by b.
        for(u32 i = 0; i <= n; ++i) radixcnt[i] = 0;
        for(u32 i = 0; i < n_elem; ++i) {
            u32 k = elem[i].b + 1;
            ++radixcnt[k];
        }
        for(u32 i = 1; i <= n; ++i) radixcnt[i] += radixcnt[i - 1];
        for(u32 p = n_elem - 1; p != u32_MAX; --p) {
            u32 i = radixorder1[p];
            u32 k = elem[i].b + 1;
            radixorder2[--radixcnt[k]] = i;
        }
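        // note: the two stable counting-sort passes above form an LSD radix
        // sort on the key (b, a): after round 2, radixorder2 lists the
        // elements grouped by column b, with rows a ascending inside each
        // group. since a < b always holds here, the feed loop below can
        // append each upper-triangular entry, then the diagonal (row == col),
        // then the transposed lower part, and every CSC column ends up with
        // strictly ascending row indices -- which the symbolic LDL pass
        // later relies on when it does `if(i >= k) break;`.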
        // feed the upper triangular elements into G and C,
        // and store the diagonal elements in tmp arrays.
        diagG[0] = (f32)1. / driver_rds[net_id];
        for(u32 pos = 0; pos < n_elem; ++pos) {
            u32 i = radixorder2[pos];
            if(elem[i].typ == ParasiticElementType_R) {
                assert(elem[i].value > 1e-7); // reject too small resistances?
                f32 invr = (f32)1. / elem[i].value;
                u32 a = elem[i].a, b = elem[i].b;
                diagG[a] += invr;
                diagG[b] += invr;
                u32 p = Gnnz[b] + Gfed[b]++;
                Gp[p] = a;
                Gv[p] = -invr;
            }
            else {
                f32 c = elem[i].value;
                u32 a = elem[i].a, b = elem[i].b;
                diagC[a] += c;
                if(b == u32_MAX) continue;
                diagC[b] += c;
                u32 p = Cnnz[b] + Cfed[b]++;
                Cp[p] = a;
                Cv[p] = -c;
            }
        }
        // feed the additional caps
        usize pin_st = net_pin_start[net_id];
        u32 num_pins = (u32)(net_pin_start[net_id + 1] - pin_st);
        for(u32 i = 0; i < num_pins; ++i) {
            diagC[i] += pin_infos[net_pins[pin_st + i]].caps[rf];
        }
        // append the diagonal elements
        for(u32 i = 0; i < n; ++i) {
            u32 p = Gnnz[i] + Gfed[i]++;
            Gp[p] = i;
            Gv[p] = diagG[i];
            p = Cnnz[i] + Cfed[i]++;
            Cp[p] = i;
            Cv[p] = diagC[i];
        }
        // append the transpose for these symmetric matrices
        for(u32 i = 0; i < n; ++i) {
            for(u32 j = 0, je = Gfed[i] - 1; j < je; ++j) {
                u32 jp = Gp[Gnnz[i] + j];
                f32 jv = Gv[Gnnz[i] + j];
                u32 p = Gnnz[jp] + Gfed[jp]++;
                Gp[p] = i;
                Gv[p] = jv;
            }
            for(u32 j = 0, je = Cfed[i] - 1; j < je; ++j) {
                u32 jp = Cp[Cnnz[i] + j];
                f32 jv = Cv[Cnnz[i] + j];
                u32 p = Cnnz[jp] + Cfed[jp]++;
                Cp[p] = i;
                Cv[p] = jv;
            }
        }
        // debug output the matrices
        // printf("G matrix = \n");
        // debug_sparse_matrix(n, Gnnz, Gp, Gv);
        // printf("C matrix = \n");
        // debug_sparse_matrix(n, Cnnz, Cp, Cv);
    }
}

/// symbolic ldl decomposition.
///
/// mna_ints follows the previous allocation layout.
/// lutmp_ints contains:
///   parent[n], flag[n], Lfed[n], Lorder[n]
/// Lnnzs is allocated with sum(n) + 1 entries, and prefix-summed.
extern "C" void sta_arom_ldl_symbolic_cpu(
    usize num_nets,
    const usize *num_nodes, const usize *num_elems,
    const u32 *mna_ints,
    u32 *lutmp_ints,
    usize *Lnnzs
) {
    for(usize net_id = 0; net_id < num_nets; ++net_id) {
        usize prev_nd = num_nodes[net_id];
        u32 n = (u32)(num_nodes[net_id + 1] - prev_nd);
        usize prev_elems = num_elems[net_id];
        u32 n_elem = (u32)(num_elems[net_id + 1] - prev_elems);
        if(n_elem == 0) continue; // skip nets without parasitics
        const u32 *Gnnz = mna_ints + (net_id * 2 + prev_nd * 4 + prev_elems * 4);
        const u32 *Gp = Gnnz + n + 1;
        u32 *parent = lutmp_ints + prev_nd * 4;
        u32 *flag = parent + n;
        usize *Lnnz = Lnnzs + prev_nd;
        for(u32 k = 0; k < n; ++k) {
            parent[k] = u32_MAX;
            flag[k] = k;
            for(u32 p = Gnnz[k], pe = Gnnz[k + 1]; p < pe; ++p) {
                u32 i = Gp[p];
                if(i >= k) break;
                for(; flag[i] != k; i = parent[i]) {
                    ++Lnnz[i + 1]; // column i has a nonzero at row k.
                    flag[i] = k;
                    if(parent[i] == u32_MAX) parent[i] = k;
                }
            }
        }
    }
    // prefix sum.
    // should also be parallelized?
    for(usize i = 1, total_nd = num_nodes[num_nets]; i <= total_nd; ++i) {
        Lnnzs[i] += Lnnzs[i - 1];
        // printf("debug: Lnnzs[%lu] = %lu\n", i, Lnnzs[i]);
    }
}
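// Note on the symbolic pass above: it is the classic elimination-tree walk
// (the same scheme as Tim Davis' LDL). For each row k, every structural
// nonzero G(i, k) with i < k is followed up the tree via parent[] until a
// node already flagged for row k is reached; each node i visited on the way
// gains exactly one nonzero at row k in column i (hence ++Lnnz[i + 1]).
// flag[i] == k means "column i already counted for row k", so no column is
// counted twice per row.
//
// Worked example (an assumed 3-node resistor chain, so G is tridiagonal):
//   k = 1: column 1 holds row 0 -> parent[0] = 1, Lnnz[1] += 1
//   k = 2: column 2 holds row 1 -> parent[1] = 2, Lnnz[2] += 1
// i.e. L is bidiagonal with one off-diagonal entry per column, as expected.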
/// numerical ldl decomposition.
///
/// mna_ints, mna_floats, lutmp_ints, Lnnzs follow the previous allocation layout.
/// Y_arr: sum(n)
/// Lp_arr: Lnnzs[num_nodes]
/// lu_floats contains:
///   D[n], Lv[Lnnz[n]]
extern "C" void sta_arom_ldl_numerical_cpu(
    usize num_nets,
    const usize *num_nodes, const usize *num_elems,
    const u32 *mna_ints, const f32 *mna_floats,
    u32 *lutmp_ints,
    const usize *Lnnzs,
    f32 *Y_arr, u32 *Lp_arr, f32 *lu_floats
) {
    for(usize net_id = 0; net_id < num_nets; ++net_id) {
        usize prev_nd = num_nodes[net_id];
        u32 n = (u32)(num_nodes[net_id + 1] - prev_nd);
        usize prev_elems = num_elems[net_id];
        u32 n_elem = (u32)(num_elems[net_id + 1] - prev_elems);
        if(n_elem == 0) continue; // skip nets without parasitics
        const u32 *Gnnz = mna_ints + (net_id * 2 + prev_nd * 4 + prev_elems * 4);
        const u32 *Gp = Gnnz + n + 1;
        const f32 *Gv = mna_floats + (prev_nd * 2 + prev_elems * 4);
        u32 *parent = lutmp_ints + prev_nd * 4;
        u32 *flag = parent + n;
        u32 *Lfed = flag + n;
        u32 *Lorder = Lfed + n;
        const usize *Lnnz = Lnnzs + prev_nd;
        usize Lnnz0 = Lnnz[0];
        u32 *Lp = Lp_arr + Lnnz[0];
        f32 *Y = Y_arr + prev_nd;
        f32 *D = lu_floats + Lnnz[0] + prev_nd;
        f32 *Lv = D + n;
        for(u32 k = 0; k < n; ++k) {
            u32 top = n;
            flag[k] = k;
            for(u32 p = Gnnz[k], pe = Gnnz[k + 1]; p < pe; ++p) {
                u32 i = Gp[p];
                if(i > k) break;
                u32 len = 0;
                Y[i] += Gv[p];
                for(; flag[i] != k; i = parent[i]) {
                    Lorder[len++] = i;
                    flag[i] = k;
                }
                // reverse the inserted part
                while(len) Lorder[--top] = Lorder[--len];
            }
            // solve row k.
            D[k] = Y[k];
            Y[k] = 0;
            while(top < n) {
                u32 i = Lorder[top++];
                f32 yi = Y[i];
                Y[i] = 0;
                // usize, not u32: Lnnz holds globally prefix-summed offsets.
                usize p, pe;
                for(p = Lnnz[i], pe = Lnnz[i] + Lfed[i]; p < pe; ++p) {
                    Y[Lp[p - Lnnz0]] -= Lv[p - Lnnz0] * yi;
                }
                f32 lki = yi / D[i];
                D[k] -= lki * yi;
                ++Lfed[i];
                Lp[pe - Lnnz0] = k;
                Lv[pe - Lnnz0] = lki;
            }
            assert(fabsf(D[k]) > 1e-8); // reject singularity
        }
        // printf("L matrix = \n");
        // debug_sparse_matrix(n, Lnnz, Lp, Lv);
        // printf("D = ");
        // for(u32 i = 0; i < n; ++i) printf("%f ", D[i]);
        // putchar('\n');
    }
}
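// Note on the numerical pass above (up-looking LDL^T): for each row k it
// performs the sparse triangular solve L(0:k, 0:k) * y = G(0:k, k) over the
// pattern found symbolically (Lorder, consumed in topological order,
// descendants before ancestors), then
//     L(k, i) = y_i / D(i)                   for each i in row k's pattern,
//     D(k)    = G(k, k) - sum_i y_i^2 / D(i).
// Y[] must start all-zero and is kept that way by re-zeroing each entry
// right after it is consumed (the `Y[i] = 0` stores).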
/// krylov, evd, and poles/residues calculation.
///
/// mna_{ints, floats}, Lnnzs, Lp_arr, lu_floats
/// follow the previous allocation layout.
///
/// net_pin_start, net_pins follow the net-pin CSR.
/// poles_arr: [num_nets][Q]
/// residues_arr: [num_pins][Q]
///
/// krylov_temps contains:
///   Hq[Q][Q], Rq[Q][Q], Qq[Q][Q], Evecq[Q][Q],
///   Zq[Q][n], Uq[Q + 1][n]
extern "C" void sta_arom_krylov_evd_poleres_cpu(
    usize num_nets,
    const usize *num_nodes, const usize *num_elems,
    const f32 *driver_rds,
    const usize *net_pin_start, const usize *net_pins,
    const u32 *mna_ints, const f32 *mna_floats,
    const usize *Lnnzs, const u32 *Lp_arr, const f32 *lu_floats,
    f32 *krylov_temps,
    f32 (*poles_arr)[Q], f32 (*residues_arr)[Q]
) {
    for(usize net_id = 0; net_id < num_nets; ++net_id) {
        usize prev_nd = num_nodes[net_id];
        u32 n = (u32)(num_nodes[net_id + 1] - prev_nd);
        usize prev_elems = num_elems[net_id];
        u32 n_elem = (u32)(num_elems[net_id + 1] - prev_elems);
        u32 max_nmat = n_elem * 2 + n;
        f32 *poles = poles_arr[net_id];
        if(n_elem == 0) {
            // skip nets without parasitics. assign NAN for their poles.
            for(u32 k = 0; k < Q; ++k) poles[k] = f32_NAN;
            continue;
        }
        const u32 *Cnnz = mna_ints + (net_id * 2 + prev_nd * 4 + prev_elems * 4)
            + n + 1 + max_nmat;
        const u32 *Cp = Cnnz + n + 1;
        const f32 *Cv = mna_floats + (prev_nd * 2 + prev_elems * 4) + max_nmat;
        const usize *Lnnz = Lnnzs + prev_nd;
        const u32 *Lp = Lp_arr + Lnnz[0];
        const f32 *D = lu_floats + Lnnz[0] + prev_nd;
        const f32 *Lv = D + n;
        f32 *krylov_temps_st = krylov_temps
            + (Q * Q * 4 * net_id + (Q * 2 + 1) * prev_nd);
        f32 (*Hq)[Q] = (f32(*)[Q])krylov_temps_st;
        f32 (*Rq)[Q] = (f32(*)[Q])(krylov_temps_st + Q * Q);
        f32 (*Qq)[Q] = (f32(*)[Q])(krylov_temps_st + Q * Q * 2);
        f32 (*Evecq)[Q] = (f32(*)[Q])(krylov_temps_st + Q * Q * 3);
        f32 *ZqF = krylov_temps_st + Q * Q * 4;
        f32 *UqF = krylov_temps_st + Q * Q * 4 + Q * n;
        // ctarnoldi: krylov
        UqF[0] = (f32)1. / driver_rds[net_id];
        lu_solve_(n, UqF + 0, Lnnz, Lp, Lv, D);
        mvmul(n, UqF + 0, Cnnz, Cp, Cv, ZqF + 0);
        f32 h00 = sqrt(dot(n, UqF + 0, ZqF + 0)); // also later used in residues
        for(u32 k = 0; k < n; ++k) UqF[k] /= h00, ZqF[k] /= h00;
        for(u32 j = 1; j <= Q; ++j) {
            for(u32 k = 0; k < n; ++k) UqF[j * n + k] = -ZqF[(j - 1) * n + k];
            lu_solve_(n, UqF + j * n, Lnnz, Lp, Lv, D);
            for(u32 i = (j <= 1 ? 0 : j - 2); i < j; ++i) {
                f32 t = dot(n, UqF + j * n, ZqF + i * n);
                Hq[i][j - 1] = t;
                for(u32 k = 0; k < n; ++k) UqF[j * n + k] -= t * UqF[i * n + k];
            }
            if(j >= Q) break;
            mvmul(n, UqF + j * n, Cnnz, Cp, Cv, ZqF + j * n);
            f32 hjpj = sqrt(dot(n, UqF + j * n, ZqF + j * n));
            Hq[j][j - 1] = hjpj;
            if(fabsf(hjpj) > 1e-5) {
                for(u32 k = 0; k < n; ++k) UqF[j * n + k] /= hjpj, ZqF[j * n + k] /= hjpj;
            }
        }
        // test Uq, Hq
        // puts("Hq = ");
        // for(u32 i = 0; i < Q; ++i) {
        //     for(u32 j = 0; j < Q; ++j) printf("%f ", Hq[i][j]);
        //     putchar('\n');
        // }
        // puts("Uq = ");
        // for(u32 i = 0; i < Q; ++i) {
        //     for(u32 j = 0; j < n; ++j) printf("%f ", UqF[i * n + j]);
        //     putchar('\n');
        // }
        // eig factor Hq.
        // first make Hq non-singular by shifting it to be diagonally dominant.
        // also, we only subtract, not add, to keep some order in the resulting eigs.
        f32 hmaxabs = 0., hmaxoffs = 0.;
        if(n < Q) {
            for(u32 i = 0; i < Q; ++i) for(u32 j = 0; j < Q; ++j) {
                hmaxabs = fmaxf(hmaxabs, fabsf(Hq[i][j]));
            }
            hmaxabs *= (f32)0.01;
            for(u32 i = 0; i < Q; ++i) {
                f32 nondiag = 0.;
                for(u32 j = 0; j < Q; ++j) if(i != j) nondiag += fabsf(Hq[i][j]);
                hmaxoffs = fmaxf(hmaxoffs, nondiag * (f32)1.01 + hmaxabs + Hq[i][i]);
            }
            // printf("hmaxoffs to be: %f\n", hmaxoffs);
        }
        for(u32 i = 0; i < Q; ++i) {
            Hq[i][i] -= hmaxoffs;
            Evecq[i][i] = 1.;
        }
        // QR iterations
        for(u32 iter = 0; iter < 32; ++iter) {
            if(iter % 4 == 3) {
                // check convergence
                bool conv = true;
                for(u32 i = 0; i < Q && conv; ++i) for(u32 j = 0; j < Q; ++j) {
                    if(i != j && fabsf(Hq[i][j]) > 1e-3) {
                        conv = false;
                        break;
                    }
                }
                if(conv) {
                    // printf("QR converged in %d iters\n", iter);
                    break;
                }
                else if(iter == 31) {
                    printf("[WARNING] QR did not converge..\n");
                }
            }
            QRdecQ(Hq, Qq, Rq);
            matmulQ(Qq, Rq, Hq);
            matmulQ(Qq, Evecq, Rq);
            for(u32 i = 0; i < Q; ++i) for(u32 j = 0; j < Q; ++j) {
                Evecq[i][j] = Rq[i][j];
            }
        }
        for(u32 i = 0; i < Q; ++i) Hq[i][i] += hmaxoffs;
        // debug eig
        // puts("eigenvalues = ");
        // for(u32 i = 0; i < Q; ++i) printf("%f ", Hq[i][i]);
        // putchar('\n');
        // puts("eigenvectors (rows) = ");
        // for(u32 i = 0; i < Q; ++i) {
        //     for(u32 j = 0; j < Q; ++j) printf("%f ", Evecq[i][j]);
        //     putchar('\n');
        // }
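        // pole/residue extraction, in the usual Arnoldi ROM form: if
        // (mu_i, v_i) are the eigenpairs of Hq (v_i = rows of Evecq after
        // the QR iterations), the order-Q approximation of the response at
        // pin j is a sum of first-order terms with
        //     pole_i        = 1 / mu_i,
        //     residue_(j,i) ~ (sum_k (v_i)_k * u_k)_j * (v_i)_0 * h00,
        // where u_k are the Krylov vectors (rows of Uq), h00 = ||G^{-1} b||_C
        // normalizes the first Krylov vector, and (v_i)_0 picks out the input
        // direction in the eigenbasis. (Sign and scaling conventions are the
        // caller's; this comment is a reading of the code below, not taken
        // from an external spec.)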
        usize netpinl = net_pin_start[net_id];
        u32 npins = (u32)(net_pin_start[net_id + 1] - netpinl);
        for(u32 i = 0; i < Q; ++i) {
            poles[i] = (f32)1. / Hq[i][i];
            for(u32 j = 0; j < npins; ++j) {
                f32 r = 0.;
                for(u32 k = 0; k < Q; ++k) r += UqF[k * n + j] * Evecq[i][k];
                residues_arr[net_pins[netpinl + j]][i] = r * Evecq[i][0] * h00;
            }
        }
    }
}
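// Intended call sequence for these kernels, assembled from the layout notes
// in the doc comments above (a sketch of the contract, not an exhaustive
// spec; sum_n = num_nodes[num_nets], sum_e = num_elems[num_nets]):
//
//     sta_arom_estimate_rd_cpu(...);        // fills driver_rds, total_caps
//     sta_arom_construct_mna_cpu(...);      // prep_ints:  num_nets + 3*sum_n + 2*sum_e
//                                           // mna_ints:   2*num_nets + 4*sum_n + 4*sum_e
//                                           // mna_floats: 2*sum_n + 4*sum_e
//     sta_arom_ldl_symbolic_cpu(...);       // lutmp_ints: 4*sum_n, Lnnzs: sum_n + 1
//     sta_arom_ldl_numerical_cpu(...);      // Lp_arr: Lnnzs[sum_n],
//                                           // lu_floats: sum_n + Lnnzs[sum_n]
//     sta_arom_krylov_evd_poleres_cpu(...); // krylov_temps:
//                                           //   4*Q*Q*num_nets + (2*Q+1)*sum_n
//
// Several buffers are accumulated into without explicit initialization
// (radixcnt, Gfed/Cfed, diagG/diagC, Gnnz[0]/Cnnz[0], Lnnzs, Lfed, Y_arr,
// and krylov_temps), so the caller must hand them in zero-filled.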