/* * Libmine core library. * * This code is written by Davide Albanese * and Michele Filosi . * * Copyright (C) 2012-2016 Davide Albanese, Copyright (C) 2012 Michele * Filosi, Copyright (C) 2012 Fondazione Bruno Kessler. * * References: * * Original MINE paper: DOI: 10.1126/science.1205438; * * Minepy paper: DOI: 10.1093/bioinformatics/bts707D; * * MIC_e and TIC: DOI: arXiv:1505.02213 and DOI: arXiv:1505.02214; * * GMIC: DOI: arXiv:1308.5712. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include "mine.h" char *libmine_version = LIBMINE_VERSION; #define MAX(a, b) (((a) > (b) ? (a) : (b))) #define MIN(a, b) (((a) < (b) ? (a) : (b))) #define TRUE 1 #define FALSE 0 void quicksort(double *a, int *idx, int l, int u) { int i, m, idx_temp; double a_temp; if (l >= u) return; m = l; for (i=l+1; i<=u; i++) { if (a[i] < a[l]) { ++m; idx_temp = idx[m]; idx[m] = idx[i]; idx[i] = idx_temp; a_temp = a[m]; a[m] = a[i]; a[i] = a_temp; } } idx_temp = idx[l]; idx[l] = idx[m]; idx[m] = idx_temp; a_temp = a[l]; a[l] = a[m]; a[m] = a_temp; quicksort(a, idx, l, m-1); quicksort(a, idx, m+1, u); } int *argsort(double *a, int n) { double *a_cpy; int i, *idx; a_cpy = (double *) malloc(n * sizeof(double)); if (a_cpy == NULL) return NULL; idx = (int *) malloc(n * sizeof(int)); if (idx == NULL) { free(a_cpy); return NULL; } /* fill a_cpy */ memcpy(a_cpy, a, n * sizeof(double)); /* fill idx */ for (i=0; i. See line 5 of Algorithm 2, SOM. * * Parameters * c : c_1, ..., c_p * c_log : log(c) * s : s in c_s * t : t in c_t */ double hp3(int *c, double *c_log, int s, int t) { int sum; double total, total_log, prob, prob_log, H = 0.0; if (s == t) return 0.0; total = (double) c[t-1]; total_log = log(total); prob = (double) c[s-1] / total; if (prob != 0) { prob_log = c_log[s-1] - total_log; H -= prob * prob_log; } sum = c[t-1] - c[s-1]; prob = (double) sum / total; if (sum != 0) { prob_log = log((double) sum) - total_log; H -= prob * prob_log; } return H; } /* * Returns the entropy induced by the points on the partition * , Q. See line 5 of Algorithm 2 in SOM. * * Parameters * cumhist : cumulative histogram matrix along P_map * cumhist_log : log(cumhist) * c : c_1, ..., c_p * q : number of rows of cumhist (number of partitions in Q_map) * p : number of cols of cumhist (number of partitions in P_map) * s : s in c_s * t : t in c_t */ double hp3q(int **cumhist, double **cumhist_log, int *c, int q, int p, int s, int t) { int i, sum; double total, total_log, prob, prob_log, H = 0.0; total = (double) c[t-1]; total_log = log(total); for (i=0; i * and Q. See line 13 of Algorithm 2, SOM. * * Parameters * cumhist : cumulative histogram matrix along P_map * c : c_1, ..., c_p * q : number of rows of cumhist (number of partitions * in Q_map) * p : number of cols of cumhist (number of partitions * in P_map) * s : s in c_s * t : t in c_t */ double hp2q(int **cumhist, int *c, int q, int p, int s, int t) { int i, sum; double total, total_log, prob, prob_log, H = 0.0 ; if (s == t) return 0.0; total = (double) (c[t-1] - c[s-1]); total_log = log(total); for (i=0; i {0, ...,q-1}. * See Algorithm 3 in SOM. * * Parameters * dy (IN): y-data sorted in increasing order * n (IN): number of elements of dy * y (IN): an integer greater than 1 * Q_map (OUT) : the map Q. Q_map must be a preallocated vector of * size n * q (OUT) : number of partitions in Q_map. q can be < y * * Returns * 0 */ int EquipartitionYAxis(double *dy, int n, int y, int *Q_map, int *q) { int i, j, s, h, curr; double temp1, temp2; double rowsize = (double) n / (double) y; i = 0; h = 0; curr = 0; while (i < n) { s = 1; for (j=i+1; j= temp2)) { ++curr; h = 0; temp1 = (double) n - (double) i; temp2 = (double) y - (double) curr; rowsize = temp1 / temp2; } for (j=0; j {0, ...,p-1}. * * Parameters * dx (IN) : x-data sorted in increasing order * n (IN) : number of elements of dx * Q_map (IN) : the map Q computed by EquipartitionYAxis sorted in * increasing order by dx-values * P_map (OUT) : the map P. P_map must be a preallocated vector * of size n * p (OUT) : number of partitions in P_map * * Returns * 0 on success, 1 if an error occurs */ int GetClumpsPartition(double *dx, int n, int *Q_map, int *P_map, int *p) { int i, j, flag, c, s; int *Q_tilde; i = 0; c = -1; Q_tilde = (int *) malloc (n * sizeof(int)); if (Q_tilde == NULL) return 1; memcpy(Q_tilde, Q_map, n*sizeof(int)); while (i < n) { s = 1; flag = FALSE; for (j=i+1; j 1) && (flag == TRUE)) { for (j=0; j {0, ...,p-1}. * * Parameters * dx (IN) : x-data sorted in increasing order * n (IN) : number of elements of dx * k_hat (IN) : maximum number of clumps * Q_map (IN) : the map Q computed by EquipartitionYAxis sorted in * increasing order by dx-values * P_map (OUT) : the map P. P_map must be a preallocated vector * of size n * p (OUT) : number of partitions in P_map * * Returns * 0 on success, 1 if an error occurs */ int GetSuperclumpsPartition(double *dx, int n, int k_hat, int *Q_map, int *P_map, int *p) { int i, ret; double *dp; /* clumps */ ret = GetClumpsPartition(dx, n, Q_map, P_map, p); if (ret) return 1; /* superclumps */ if (*p > k_hat) { dp = (double *) malloc (n * sizeof(double)); if (dp == NULL) return 1; for (i=0; i F_max) { I[t][2] = HQ + F; F_max = F; } } } /* * Inductively build the rest of the table of optimal partitions, * Algorithm 2 in SOM, lines 10-17 */ for (l=3; l<=x; l++) { for (t=l; t<=p; t++) { ct = (double) c[t-1]; F_max = -DBL_MAX; for (s=l-1; s<=t; s++) { cs = (double) c[s-1]; F = ((cs/ct) * (I[s][l-1]-HQ)) - (((ct-cs)/ct) * HP2Q[s][t]); if (F > F_max) { I[t][l] = HQ + F; F_max = F; } } } } /* Algorithm 2 in SOM, line 19 */ for (i=p+1; i<=x; i++) I[p][i] = I[p][p]; /* score */ for (i=2; i<=x; i++) score[i-2] = I[p][i] / MIN(log(i), log(q)); /* start frees */ for (i=0; i<=p; i++) free(HP2Q[i]); free(HP2Q); for (i=0; i<=p; i++) free(I[i]); free(I); for (i=0; ialpha > 0.0) && (param->alpha <= 1.0)) B = MAX(pow(prob->n, param->alpha), 4); else if (param->alpha >= 4) B = MIN(param->alpha, prob->n); else goto error_score; score = (mine_score *) malloc (sizeof(mine_score)); if (score == NULL) goto error_score; score->n = MAX((int) floor(B/2.0), 2) - 1; score->m = (int *) malloc(score->n * sizeof(int)); if (score->m == NULL) goto error_score_m; for (i=0; in; i++) score->m[i] = (int) floor((double) B / (double) (i+2)) - 1; score->M = (double **) malloc (score->n * sizeof(double *)); if (score->M == NULL) goto error_score_M; for (i=0; in; i++) { score->M[i] = (double *) malloc ((score->m[i]) * sizeof(double)); if (score->M[i] == NULL) { for (j=0; jM[j]); goto error_score_M_i; } } return score; error_score_M_i: free(score->M); error_score_M: free(score->m); error_score_m: free(score); error_score: return NULL; } /* See mine.h */ mine_score *mine_compute_score(mine_problem *prob, mine_parameter *param) { int i, j, k, p, q, ret; double *xx, *yy, *xy, *yx, *M_temp; int *ix, *iy; int *Q_map_temp, *Q_map, *P_map; mine_score *score; score = init_score(prob, param); if (score == NULL) goto error_score; xx = (double *) malloc (prob->n * sizeof(double)); if (xx == NULL) goto error_xx; yy = (double *) malloc (prob->n * sizeof(double)); if (yy == NULL) goto error_yy; xy = (double *) malloc (prob->n * sizeof(double)); if (xy == NULL) goto error_xy; yx = (double *) malloc (prob->n * sizeof(double)); if (yx == NULL) goto error_yx; Q_map_temp = (int *) malloc (prob->n * sizeof(int)); if (Q_map_temp == NULL) goto error_Q_temp; Q_map = (int *) malloc (prob->n * sizeof(int)); if (Q_map == NULL) goto error_Q; P_map = (int *) malloc (prob->n * sizeof(int)); if (P_map == NULL) goto error_P; ix = argsort(prob->x, prob->n); if (ix == NULL) goto error_ix; iy = argsort(prob->y, prob->n); if (iy == NULL) goto error_iy; M_temp = (double *)malloc ((score->m[0]) * sizeof(double)); if (M_temp == NULL) goto error_M_temp; /* build xx, yy, xy, yx */ for (i=0; in; i++) { xx[i] = prob->x[ix[i]]; yy[i] = prob->y[iy[i]]; xy[i] = prob->x[iy[i]]; yx[i] = prob->y[ix[i]]; } /* x vs. y */ for (i=0; in; i++) { k = MAX((int) (param->c * (score->m[i]+1)), 1); ret = EquipartitionYAxis(yy, prob->n, i+2, Q_map, &q); if (ret) goto error_0; /* sort Q by x */ for (j=0; jn; j++) Q_map_temp[iy[j]] = Q_map[j]; for (j=0; jn; j++) Q_map[j] = Q_map_temp[ix[j]]; ret = GetSuperclumpsPartition(xx, prob->n, k, Q_map, P_map, &p); if (ret) goto error_0; if (param->est == EST_MIC_APPROX) ret = OptimizeXAxis(xx, yx, prob->n, Q_map, q, P_map, p, score->m[i]+1, score->M[i]); else /* EST_MIC_E */ ret = OptimizeXAxis(xx, yx, prob->n, Q_map, q, P_map, p, MIN(i+2, score->m[i]+1), score->M[i]); if (ret) goto error_0; } /* y vs. x */ for (i=0; in; i++) { k = MAX((int) (param->c * (score->m[i]+1)), 1); ret = EquipartitionYAxis(xx, prob->n, i+2, Q_map, &q); if (ret) goto error_0; /* sort Q by y */ for (j=0; jn; j++) Q_map_temp[ix[j]] = Q_map[j]; for (j=0; jn; j++) Q_map[j] = Q_map_temp[iy[j]]; ret = GetSuperclumpsPartition(yy, prob->n, k, Q_map, P_map, &p); if (ret) goto error_0; if (param->est == EST_MIC_APPROX) ret = OptimizeXAxis(yy, xy, prob->n, Q_map, q, P_map, p, score->m[i]+1, M_temp); else /* EST_MIC_E */ ret = OptimizeXAxis(yy, xy, prob->n, Q_map, q, P_map, p, MIN(i+2, score->m[i]+1), M_temp); if (ret) goto error_0; if (param->est == EST_MIC_APPROX) for (j=0; jm[i]; j++) score->M[j][i] = MAX(M_temp[j], score->M[j][i]); else /* EST_MIC_E */ for (j=0; jm[i]); j++) score->M[j][i] = M_temp[j]; } free(M_temp); free(iy); free(ix); free(P_map); free(Q_map); free(Q_map_temp); free(yx); free(xy); free(yy); free(xx); return score; error_0: free(M_temp); error_M_temp: free(iy); error_iy: free(ix); error_ix: free(P_map); error_P: free(Q_map); error_Q: free(Q_map_temp); error_Q_temp: free(yx); error_yx: free(xy); error_xy: free(yy); error_yy: free(xx); error_xx: for (i=0; in; i++) free(score->M[i]); free(score->M); free(score->m); free(score); error_score: return NULL; } /* See mine.h */ char *mine_check_parameter(mine_parameter *param) { if (((param->alpha <= 0.0) || (param->alpha > 1.0)) && (param->alpha < 4.0)) return "alpha must be in (0.0, 1.0] or >= 4.0"; if (param->c <= 0.0) return "c must be > 0.0"; if ((param->est != EST_MIC_APPROX) && (param->est != EST_MIC_E)) return "unknown estimator"; return NULL; } /* See mine.h */ double mine_mic(mine_score *score) { int i, j; double score_max = 0.0; for (i=0; in; i++) for (j=0; jm[i]; j++) if (score->M[i][j] > score_max) score_max = score->M[i][j]; return score_max; } /* See mine.h */ double mine_mas(mine_score *score) { int i, j; double score_curr; double score_max = 0.0; for (i=0; in; i++) for (j=0; jm[i]; j++) { score_curr = fabs(score->M[i][j] - score->M[j][i]); if (score_curr > score_max) score_max = score_curr; } return score_max; } /* See mine.h */ double mine_mev(mine_score *score) { int i, j; double score_max = 0.0; for (i=0; in; i++) for (j=0; jm[i]; j++) if (((j==0) || (i==0)) && score->M[i][j] > score_max) score_max = score->M[i][j]; return score_max; } /* See mine.h */ double mine_mcn(mine_score *score, double eps) { int i, j; double log_xy; double score_min = DBL_MAX; double delta = 0.0001; /* avoids overestimation of mcn */ double mic = mine_mic(score); for (i=0; in; i++) for (j=0; jm[i]; j++) { log_xy = log((i+2) * (j+2)) / log(2.0); if (((score->M[i][j]+delta) >= ((1.0 - eps) * mic)) && (log_xy < score_min)) score_min = log_xy; } return score_min; } /* See mine.h */ double mine_mcn_general(mine_score *score) { int i, j; double log_xy; double score_min = DBL_MAX; double delta = 0.0001; /* avoids overestimation of mcn */ double mic = mine_mic(score); for (i=0; in; i++) for (j=0; jm[i]; j++) { log_xy = log((i+2) * (j+2)) / log(2.0); if (((score->M[i][j]+delta) >= (mic * mic)) && (log_xy < score_min)) score_min = log_xy; } return score_min; } /* See mine.h */ double mine_tic(mine_score *score, int norm) { int i, j, k=0; double tic = 0.0; for (i=0; in; i++) for (j=0; jm[i]; j++) { tic += score->M[i][j]; k++; } if (norm) tic /= k; return tic; } /* See mine.h */ double mine_gmic(mine_score *score, double p) { int i, j, k, Z, B; mine_score *score_sub, *C_star; double gmic; /* alloc score_sub */ score_sub = (mine_score *) malloc (sizeof(mine_score)); /* alloc C_star */ C_star = (mine_score *) malloc (sizeof(mine_score)); C_star->m = (int *) malloc(score->n * sizeof(int)); C_star->M = (double **) malloc (score->n * sizeof(double *)); for (i=0; in; i++) C_star->M[i] = (double *) malloc ((score->m[i]) * sizeof(double)); /* prepare score_sub */ score_sub->M = score->M; /* prepare C_star */ C_star->n = score->n; for (i=0; in; i++) C_star->m[i] = score->m[i]; /* compute C_star */ for (i=0; in; i++) for (j=0; jm[i]; j++) { B = (i+2) * (j+2); score_sub->n = MAX((int) floor(B/2.0), 2) - 1; score_sub->m = (int *) malloc(score_sub->n * sizeof(int)); for (k=0; kn; k++) score_sub->m[k] = (int) floor((double) B / (double) (k+2)) - 1; C_star->M[i][j] = mine_mic(score_sub); free(score_sub->m); } /* p=0 -> geometric mean */ if (p == 0.0) { Z = 0; gmic = 1.0; for (i=0; in; i++) for (j=0; jm[i]; j++) { gmic *= C_star->M[i][j]; Z++; } gmic = pow(gmic, (double) Z); } /* p!=0 -> generalized mean */ else { Z = 0; gmic = 0.0; for (i=0; in; i++) for (j=0; jm[i]; j++) { gmic += pow(C_star->M[i][j], p); Z++; } gmic /= (double) Z; gmic = pow(gmic, 1.0/p); } free(score_sub); if (C_star->n != 0) { free(C_star->m); for (i=0; in; i++) free(C_star->M[i]); free(C_star->M); } free(C_star); return gmic; } /* See mine.h */ void mine_free_score(mine_score **score) { int i; mine_score *score_ptr = *score; if (score_ptr != NULL) { if (score_ptr->n != 0) { free(score_ptr->m); for (i=0; in; i++) free(score_ptr->M[i]); free(score_ptr->M); } free(score_ptr); score_ptr = NULL; } } mine_pstats *mine_compute_pstats(mine_matrix *X, mine_parameter *param) { int i, j, k; mine_problem prob; mine_score *score; mine_pstats *stats; /* Allocate memory for stats */ stats = (mine_pstats *) malloc(sizeof(mine_pstats)); stats->n = (X->n * (X->n-1)) / 2; stats->mic = (double *) malloc(stats->n * sizeof(double)); stats->tic = (double *) malloc(stats->n * sizeof(double)); k = 0; prob.n = X->m; for (i=0; in-1; i++) { prob.x = &X->data[i*X->m]; for (j=i+1; jn; j++) { prob.y = &X->data[j*X->m]; score = mine_compute_score(&prob, param); stats->mic[k] = mine_mic(score); stats->tic[k] = mine_tic(score, TRUE); mine_free_score(&score); k++; } } return stats; } mine_cstats *mine_compute_cstats(mine_matrix *X, mine_matrix *Y, mine_parameter *param) { int i, j, k; mine_cstats *stats; mine_problem prob; mine_score *score; if (X->m != Y->m) return NULL; /* Allocate memory for stats */ stats = (mine_cstats *) malloc(sizeof(mine_cstats)); stats->n = X->n; stats->m = Y->n; stats->mic = (double *) malloc((stats->n * stats->m) * sizeof(double)); stats->tic = (double *) malloc((stats->n * stats->m) * sizeof(double)); k = 0; prob.n = X->m; for (i=0; in; i++) { prob.x = &X->data[i*X->m]; for (j=0; jn; j++) { prob.y = &Y->data[j*Y->m]; score = mine_compute_score(&prob, param); stats->mic[k] = mine_mic(score); stats->tic[k] = mine_tic(score, TRUE); mine_free_score(&score); k++; } } return stats; }