/* vim:set ts=4 sw=4 sts=4 et: */
/* plfit.c
 *
 * Copyright (C) 2010-2011 Tamas Nepusz
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

/* Note: the angle-bracket header names were lost in the original text; the
 * standard headers listed below are the ones this file demonstrably uses
 * (DBL_MAX, math functions, printf/snprintf, malloc/qsort, memcpy). */
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "plfit_error.h"
#include "gss.h"
#include "lbfgs.h"
#include "platform.h"
#include "plfit.h"
#include "kolmogorov.h"
#include "plfit_sampling.h"
#include "hzeta.h"

/* #define PLFIT_DEBUG */

#define DATA_POINTS_CHECK \
    if (n <= 0) { \
        PLFIT_ERROR("no data points", PLFIT_EINVAL); \
    }

#define XMIN_CHECK_ZERO \
    if (xmin <= 0) { \
        PLFIT_ERROR("xmin must be greater than zero", PLFIT_EINVAL); \
    }

#define XMIN_CHECK_ONE \
    if (xmin < 1) { \
        PLFIT_ERROR("xmin must be at least 1", PLFIT_EINVAL); \
    }

static int plfit_i_resample_continuous(double* xs_head, size_t num_smaller,
        size_t n, double alpha, double xmin, size_t num_samples,
        plfit_mt_rng_t* rng, double* result);
static int plfit_i_resample_discrete(double* xs_head, size_t num_smaller,
        size_t n, double alpha, double xmin, size_t num_samples,
        plfit_mt_rng_t* rng, double* result);

static int double_comparator(const void *a, const void *b) {
    const double *da = (const double*)a;
    const double *db = (const double*)b;
    return (*da > *db) - (*da < *db);
}

static int plfit_i_copy_and_sort(double* xs, size_t n, double** result) {
    *result = (double*)malloc(sizeof(double) * n);
    if (*result == 0) {
        PLFIT_ERROR("cannot create sorted copy of input data", PLFIT_ENOMEM);
    }

    memcpy(*result, xs, sizeof(double) * n);
    qsort(*result, n, sizeof(double), double_comparator);

    return PLFIT_SUCCESS;
}

/**
 * Given an unsorted array of doubles, counts how many elements there are that
 * are smaller than a given value.
 *
 * \param begin pointer to the beginning of the array
 * \param end   pointer to the first element after the end of the array
 * \param xmin  the threshold value
 *
 * \return the number of elements in the array that are smaller than the given
 *         value.
 */
static size_t count_smaller(double* begin, double* end, double xmin) {
    double* p;
    size_t counter = 0;

    for (p = begin; p < end; p++) {
        if (*p < xmin) {
            counter++;
        }
    }

    return counter;
}

/**
 * Given an unsorted array of doubles, return another array that contains the
 * elements that are smaller than a given value
 *
 * \param begin pointer to the beginning of the array
 * \param end   pointer to the first element after the end of the array
 * \param xmin  the threshold value
 * \param result_length if not \c NULL, the number of unique elements in the
 *        given array is returned here
 *
 * \return pointer to the head of the new array or 0 if there is not enough
 *         memory
 */
static double* extract_smaller(double* begin, double* end, double xmin,
        size_t* result_length) {
    size_t counter = count_smaller(begin, end, xmin);
    double *p, *result;

    result = calloc(counter, sizeof(double));
    if (result == 0)
        return 0;

    for (p = result; begin < end; begin++) {
        if (*begin < xmin) {
            *p = *begin;
            p++;
        }
    }

    if (result_length) {
        *result_length = counter;
    }

    return result;
}

/**
 * Given a sorted array of doubles, return another array that contains pointers
 * into the array for the start of each block of identical elements.
 *
 * \param begin pointer to the beginning of the array
 * \param end   pointer to the first element after the end of the array
 * \param result_length if not \c NULL, the number of unique elements in the
 *        given array is returned here. It is left unchanged if
 *        the function returns with an error.
 *
 * \return pointer to the head of the new array or 0 if there is not enough
 *         memory
 */
static double** unique_element_pointers(double* begin, double* end,
        size_t* result_length) {
    double* ptr = begin;
    double** result;
    double prev_x;
    size_t num_elts = 15;
    size_t used_elts = 0;

    /* Special case: empty array */
    if (begin == end) {
        result = calloc(1, sizeof(double*));
        if (result != 0) {
            result[0] = 0;
            if (result_length != 0) {
                *result_length = 0;
            }
        }
        return result;
    }

    /* Allocate initial result array, including the guard element */
    result = calloc(num_elts+1, sizeof(double*));
    if (result == 0)
        return 0;

    prev_x = *begin;
    result[used_elts++] = begin;

    /* Process the input array */
    for (ptr = begin+1; ptr < end; ptr++) {
        if (*ptr == prev_x)
            continue;

        /* New block found */
        if (used_elts >= num_elts) {
            /* Array full; allocate a new chunk. Use a temporary pointer so the
             * original array is freed (and not leaked) if realloc fails. */
            double** new_result;
            num_elts = num_elts*2 + 1;
            new_result = realloc(result, sizeof(double*) * (num_elts+1));
            if (new_result == 0) {
                free(result);
                return 0;
            }
            result = new_result;
        }

        /* Store the new element */
        result[used_elts++] = ptr;
        prev_x = *ptr;
    }

    /* Calculate the result length */
    if (result_length != 0) {
        *result_length = used_elts;
    }

    /* Add the guard entry to the end of the result */
    result[used_elts++] = 0;

    return result;
}

static void plfit_i_perform_finite_size_correction(plfit_result_t* result, size_t n) {
    result->alpha = result->alpha * (n-1) / n + 1.0 / n;
}

/********** Continuous power law distribution fitting **********/

static void plfit_i_logsum_less_than_continuous(double* begin, double* end,
        double xmin, double* result, size_t* m) {
    double logsum = 0.0;
    size_t count = 0;

    for (; begin != end; begin++) {
        if (*begin >= xmin) {
            count++;
            logsum += log(*begin / xmin);
        }
    }

    *m = count;
    *result = logsum;
}

static double plfit_i_logsum_continuous(double* begin, double* end, double xmin) {
    double logsum = 0.0;
    for (; begin != end; begin++)
        logsum += log(*begin / xmin);
    return logsum;
}

static int plfit_i_estimate_alpha_continuous(double* xs, size_t n,
        double xmin, double* alpha) {
    double result;
    size_t m;

    XMIN_CHECK_ZERO;
    plfit_i_logsum_less_than_continuous(xs, xs+n, xmin, &result, &m);
    if (m == 0) {
        PLFIT_ERROR("no data point was larger than xmin", PLFIT_EINVAL);
    }

    *alpha = 1 + m / result;

    return PLFIT_SUCCESS;
}

static int plfit_i_estimate_alpha_continuous_sorted(double* xs, size_t n,
        double xmin, double* alpha) {
    double* end = xs+n;

    XMIN_CHECK_ZERO;

    for (; xs != end && *xs < xmin; xs++);
    if (xs == end) {
        PLFIT_ERROR("no data point was larger than xmin", PLFIT_EINVAL);
    }

    *alpha = 1 + (end-xs) / plfit_i_logsum_continuous(xs, end, xmin);

    return PLFIT_SUCCESS;
}

static int plfit_i_ks_test_continuous(double* xs, double* xs_end,
        const double alpha, const double xmin, double* D) {
    /* Assumption: xs is sorted and cut off at xmin so the first element is
     * always larger than or equal to xmin. */
    double result = 0, n;
    int m = 0;

    n = xs_end - xs;
    while (xs < xs_end) {
        double d = fabs(1-pow(xmin / *xs, alpha-1) - m / n);

        if (d > result)
            result = d;

        xs++; m++;
    }

    *D = result;

    return PLFIT_SUCCESS;
}

static int plfit_i_calculate_p_value_continuous(double* xs, size_t n,
        const plfit_continuous_options_t *options, plfit_bool_t xmin_fixed,
        plfit_result_t *result) {
    long int num_trials;
    long int successes = 0;
    double *xs_head;
    size_t num_smaller;
    plfit_continuous_options_t options_no_p_value = *options;
    int retval = PLFIT_SUCCESS;

    if (options->p_value_method == PLFIT_P_VALUE_SKIP) {
        result->p = NAN;
        return PLFIT_SUCCESS;
    }

    if (options->p_value_method == PLFIT_P_VALUE_APPROXIMATE) {
        num_smaller = count_smaller(xs, xs + n, result->xmin);
        result->p = plfit_ks_test_one_sample_p(result->D, n - num_smaller);
        return PLFIT_SUCCESS;
    }

    options_no_p_value.p_value_method = PLFIT_P_VALUE_SKIP;

    num_trials = (long int)(0.25 / options->p_value_precision / options->p_value_precision);
    if (num_trials <= 0) {
        PLFIT_ERROR("invalid p-value precision", PLFIT_EINVAL);
    }

    /* Extract the head of xs that contains elements smaller than xmin */
    xs_head = extract_smaller(xs, xs+n, result->xmin, &num_smaller);
    if (xs_head == 0)
        PLFIT_ERROR("cannot calculate exact p-value", PLFIT_ENOMEM);

#ifdef _OPENMP
#pragma omp parallel
#endif
    {
        /* Parallel section starts here. If we are compiling using OpenMP, each
         * thread will use its own RNG that is seeded from the master RNG. If
         * we are compiling without OpenMP, there is only one thread and it uses
         * the master RNG. This section must be critical to ensure that only one
         * thread is using the master RNG at the same time. */
#ifdef _OPENMP
        plfit_mt_rng_t private_rng;
#endif
        plfit_mt_rng_t *p_rng;
        double *ys;
        long int i;
        plfit_result_t result_synthetic;

#ifdef _OPENMP
#pragma omp critical
        {
            p_rng = &private_rng;
            plfit_mt_init_from_rng(p_rng, options->rng);
        }
#else
        p_rng = options->rng;
#endif

        /* Allocate memory to sample into */
        ys = calloc(n, sizeof(double));
        if (ys == 0) {
            retval = PLFIT_ENOMEM;
        } else {
            /* The main for loop starts here. */
#ifdef _OPENMP
#pragma omp for reduction(+:successes)
#endif
            for (i = 0; i < num_trials; i++) {
                plfit_i_resample_continuous(xs_head, num_smaller, n, result->alpha,
                        result->xmin, n, p_rng, ys);
                if (xmin_fixed) {
                    plfit_estimate_alpha_continuous(ys, n, result->xmin,
                            &options_no_p_value, &result_synthetic);
                } else {
                    plfit_continuous(ys, n, &options_no_p_value, &result_synthetic);
                }
                if (result_synthetic.D > result->D)
                    successes++;
            }

            free(ys);
        }
        /* End of parallelized part */
    }

    free(xs_head);

    if (retval == PLFIT_SUCCESS) {
        result->p = successes / ((double)num_trials);
    } else {
        PLFIT_ERROR("cannot calculate exact p-value", retval);
    }

    return retval;
}

int plfit_log_likelihood_continuous(double* xs, size_t n, double alpha,
        double xmin, double* L) {
    double logsum, c;
    size_t m;

    if (alpha <= 1) {
        PLFIT_ERROR("alpha must be greater than one", PLFIT_EINVAL);
    }
    XMIN_CHECK_ZERO;

    c = (alpha - 1) / xmin;
    plfit_i_logsum_less_than_continuous(xs, xs+n, xmin, &logsum, &m);

    *L = -alpha * logsum + log(c) * m;

    return PLFIT_SUCCESS;
}

int plfit_estimate_alpha_continuous_sorted(double* xs, size_t n, double xmin,
        const plfit_continuous_options_t* options, plfit_result_t *result) {
    double *begin, *end;

    if (!options)
        options = &plfit_continuous_default_options;

    begin = xs; end = xs + n;
    while (begin < end && *begin < xmin)
        begin++;

    PLFIT_CHECK(plfit_i_estimate_alpha_continuous_sorted(begin, end-begin, xmin,
            &result->alpha));
    PLFIT_CHECK(plfit_i_ks_test_continuous(begin, end, result->alpha, xmin,
            &result->D));

    if (options->finite_size_correction)
        plfit_i_perform_finite_size_correction(result, end-begin);

    result->xmin = xmin;

    PLFIT_CHECK(plfit_log_likelihood_continuous(begin, end-begin, result->alpha,
            result->xmin, &result->L));
    PLFIT_CHECK(plfit_i_calculate_p_value_continuous(xs, n, options, 1, result));

    return PLFIT_SUCCESS;
}

int plfit_estimate_alpha_continuous(double* xs, size_t n, double xmin,
        const plfit_continuous_options_t* options, plfit_result_t *result) {
    double *xs_copy;

    if (!options)
        options = &plfit_continuous_default_options;

    PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
    PLFIT_CHECK(plfit_estimate_alpha_continuous_sorted(xs_copy, n, xmin,
            options, result));
    free(xs_copy);

    return PLFIT_SUCCESS;
}

typedef struct {
    double *begin;         /**< Pointer to the beginning of the array holding the data */
    double *end;           /**< Pointer to after the end of the array holding the data */
    double **probes;       /**< Pointers to the elements of the array that will be probed */
    size_t num_probes;     /**< Number of probes */
    plfit_result_t last;   /**< Result of the last evaluation */
} plfit_continuous_xmin_opt_data_t;

static double plfit_i_continuous_xmin_opt_evaluate(void* instance, double x) {
    plfit_continuous_xmin_opt_data_t* data = (plfit_continuous_xmin_opt_data_t*)instance;
    double* begin = data->probes[(long int)x];

    data->last.xmin = *begin;

#ifdef PLFIT_DEBUG
    printf("Trying with probes[%ld] = %.4f\n", (long int)x, *begin);
#endif

    plfit_i_estimate_alpha_continuous_sorted(begin, data->end-begin, *begin,
            &data->last.alpha);
    plfit_i_ks_test_continuous(begin, data->end, data->last.alpha, *begin,
            &data->last.D);

    return data->last.D;
}

static int plfit_i_continuous_xmin_opt_progress(void* instance, double x, double fx,
        double min, double fmin, double left, double right, int k) {
#ifdef PLFIT_DEBUG
    printf("Iteration #%d: [%.4f; %.4f), x=%.4f, fx=%.4f, min=%.4f, fmin=%.4f\n",
            k, left, right, x, fx, min, fmin);
#endif

    /* Continue only if `left' and `right' point to different integers */
    return (int)left == (int)right;
}

static int
plfit_i_continuous_xmin_opt_linear_scan(plfit_continuous_xmin_opt_data_t* opt_data,
        plfit_result_t* best_result, size_t* best_n) {
    /* this must be signed because OpenMP with Windows MSVC needs signed for
     * loop index variables. ssize_t will not work because that is a POSIX
     * extension */
    ptrdiff_t i = 0;   /* initialize to work around incorrect warning issued by Clang 9.0 */

    plfit_result_t global_best_result;
    size_t global_best_n;

    /* Prepare some variables */
    global_best_n = 0;
    global_best_result.D = DBL_MAX;
    global_best_result.xmin = 0;
    global_best_result.alpha = 0;

    /* Due to the OpenMP parallelization, we do things as follows. Each
     * OpenMP thread will search for the best D-score on its own and store
     * the result in a private local_best_result variable. The end of the
     * parallel block contains a critical section that threads will enter
     * one by one and compare their private local_best_result with a
     * global_best that is shared among the threads. */
#ifdef _OPENMP
#pragma omp parallel shared(global_best_result, global_best_n) private(i) firstprivate(opt_data)
#endif
    {
        /* These variables are private since they are declared within the
         * parallel block */
        plfit_result_t local_best_result;
        plfit_continuous_xmin_opt_data_t local_opt_data = *opt_data;
        size_t local_best_n;

        /* Initialize the local_best_result and local_best_n variables */
        local_best_n = 0;
        local_best_result.D = DBL_MAX;
        local_best_result.xmin = 0;
        local_best_result.alpha = 0;
        local_best_result.p = NAN;
        local_best_result.L = NAN;

        /* The range of the for loop below is divided among the threads.
         * nowait means that there will be no implicit barrier at the end
         * of the loop so threads that get there earlier can enter the
         * critical section without waiting for the others */
#ifdef _OPENMP
#pragma omp for nowait schedule(dynamic,10)
#endif
        for (i = 0; i < local_opt_data.num_probes-1; i++) {
            plfit_i_continuous_xmin_opt_evaluate(&local_opt_data, i);
            if (local_opt_data.last.D < local_best_result.D) {
#ifdef PLFIT_DEBUG
                printf("Found new local best at %g with D=%g\n",
                        local_opt_data.last.xmin, local_opt_data.last.D);
#endif
                local_best_result = local_opt_data.last;
                local_best_n = local_opt_data.end - local_opt_data.probes[i] + 1;
            }
        }

        /* Critical section that finds the global best result from the
         * local ones collected by each thread */
#ifdef _OPENMP
#pragma omp critical
#endif
        if (local_best_result.D < global_best_result.D) {
            global_best_result = local_best_result;
            global_best_n = local_best_n;
#ifdef PLFIT_DEBUG
            printf("Found new global best at %g with D=%g\n",
                    global_best_result.xmin, global_best_result.D);
#endif
        }
    }

    *best_result = global_best_result;
    *best_n = global_best_n;

#ifdef PLFIT_DEBUG
    printf("Returning global best: %g\n", best_result->xmin);
#endif

    return PLFIT_SUCCESS;
}

int plfit_continuous(double* xs, size_t n, const plfit_continuous_options_t* options,
        plfit_result_t* result) {
    gss_parameter_t gss_param;
    plfit_continuous_xmin_opt_data_t opt_data;
    plfit_result_t best_result = {
        /* alpha = */ NAN,
        /* xmin = */ NAN,
        /* L = */ NAN,
        /* D = */ NAN,
        /* p = */ NAN
    };
    int success;
    size_t i, best_n, num_uniques = 0;
    double x, *px, **uniques;

    DATA_POINTS_CHECK;

    /* Sane defaults */
    best_n = n;
    if (!options)
        options = &plfit_continuous_default_options;

    /* Make a copy of xs and sort it */
    PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &opt_data.begin));
    opt_data.end = opt_data.begin + n;

    /* Create an array containing pointers to the unique elements of the input. From
     * each block of unique elements, we add the pointer to the first one. */
    uniques = unique_element_pointers(opt_data.begin, opt_data.end, &num_uniques);
    if (uniques == 0)
        PLFIT_ERROR("cannot fit continuous power-law", PLFIT_ENOMEM);

    /* We will now determine the best xmin that yields the lowest D-score. The
     * 'success' variable will denote whether the search procedure we tried was
     * successful. If it is false after having exhausted all options, we fall
     * back to a linear search. */
    success = 0;

    switch (options->xmin_method) {
        case PLFIT_GSS_OR_LINEAR:
            /* Try golden section search first. */
            if (num_uniques > 5) {
                opt_data.probes = uniques;
                opt_data.num_probes = num_uniques;

                gss_parameter_init(&gss_param);
                success = (gss(0, opt_data.num_probes-5, &x, 0,
                        plfit_i_continuous_xmin_opt_evaluate,
                        plfit_i_continuous_xmin_opt_progress,
                        &opt_data, &gss_param) == 0);
                if (success) {
                    px = opt_data.probes[(int)x];
                    best_n = opt_data.end-px+1;
                    best_result = opt_data.last;
                }
            }
            break;

        case PLFIT_STRATIFIED_SAMPLING:
            if (num_uniques >= 50) {
                /* Try stratified sampling to narrow down the interval where the minimum
                 * is likely to reside. We check 10% of the unique items, distributed
                 * evenly, find the one with the lowest D-score, and then check the
                 * area around it more thoroughly. */
                const size_t subdivision_length = 10;
                size_t num_strata = num_uniques / subdivision_length;
                double **strata = calloc(num_strata, sizeof(double*));
                int error_code;

                for (i = 0; i < num_strata; i++) {
                    strata[i] = uniques[i * subdivision_length];
                }

                opt_data.probes = strata;
                opt_data.num_probes = num_strata;
                error_code = plfit_i_continuous_xmin_opt_linear_scan(&opt_data,
                        &best_result, &best_n);
                if (error_code != PLFIT_SUCCESS) {
                    free(strata);
                    return error_code;
                }

                opt_data.num_probes = 0;
                for (i = 0; i < num_strata; i++) {
                    if (*strata[i] == best_result.xmin) {
                        /* Okay, scan more thoroughly from strata[i-1] to strata[i+1],
                         * which is from uniques[(i-1)*subdivision_length] to
                         * uniques[(i+1)*subdivision_length] */
                        opt_data.probes = uniques + (i > 0 ? (i-1)*subdivision_length : 0);
                        opt_data.num_probes = 0;
                        if (i != 0)
                            opt_data.num_probes += subdivision_length;
                        if (i != num_strata-1)
                            opt_data.num_probes += subdivision_length;
                        break;
                    }
                }

                free(strata);

                if (opt_data.num_probes > 0) {
                    /* Do a strict linear scan in the subrange determined above */
                    PLFIT_CHECK(
                        plfit_i_continuous_xmin_opt_linear_scan(
                            &opt_data, &best_result, &best_n
                        )
                    );
                    success = 1;
                } else {
                    /* This should not happen, but we handle it anyway */
                    success = 0;
                }
            }
            break;

        default:
            /* Just use the linear search */
            break;
    }

    if (!success) {
        /* More advanced search methods failed or were skipped; try linear search */
        opt_data.probes = uniques;
        opt_data.num_probes = num_uniques;
        PLFIT_CHECK(plfit_i_continuous_xmin_opt_linear_scan(&opt_data,
                &best_result, &best_n));
        success = 1;
    }

    /* Get rid of the uniques array, we don't need it any more */
    free(uniques);

    /* Sort out the result */
    *result = best_result;
    if (options->finite_size_correction)
        plfit_i_perform_finite_size_correction(result, best_n);

    PLFIT_CHECK(plfit_log_likelihood_continuous(opt_data.begin + n - best_n,
            best_n, result->alpha, result->xmin, &result->L));
    PLFIT_CHECK(plfit_i_calculate_p_value_continuous(opt_data.begin, n, options,
            0, result));

    /* Get rid of the copied data as well */
    free(opt_data.begin);

    return PLFIT_SUCCESS;
}

/********** Discrete power law distribution fitting **********/

typedef struct {
    size_t m;
    double logsum;
    double xmin;
} plfit_i_estimate_alpha_discrete_data_t;

static double plfit_i_logsum_discrete(double* begin, double* end, double xmin) {
    double logsum = 0.0;
    for (; begin != end; begin++)
        logsum += log(*begin);
    return logsum;
}

static void plfit_i_logsum_less_than_discrete(double* begin, double* end,
        double xmin, double* logsum, size_t* m) {
    double result = 0.0;
    size_t count = 0;

    for (; begin != end; begin++) {
        if (*begin < xmin)
            continue;

        result += log(*begin);
        count++;
    }

    *logsum = result;
    *m = count;
}

static lbfgsfloatval_t plfit_i_estimate_alpha_discrete_lbfgs_evaluate(
        void* instance, const lbfgsfloatval_t* x,
        lbfgsfloatval_t* g, const int n,
        const lbfgsfloatval_t step) {
    plfit_i_estimate_alpha_discrete_data_t* data;
    lbfgsfloatval_t result;
    double dx = step;
    double huge = 1e10;   /* pseudo-infinity; apparently DBL_MAX does not work */
    double lnhzeta_x = NAN;
    double lnhzeta_deriv_x = NAN;

    data = (plfit_i_estimate_alpha_discrete_data_t*)instance;

#ifdef PLFIT_DEBUG
    printf("- Evaluating at %.4f (step = %.4f, xmin = %.4f)\n", *x, step, data->xmin);
#endif

    if (isnan(*x)) {
        g[0] = huge;
        return huge;
    }

    /* Find the delta X value to estimate the gradient */
    if (dx > 0.001 || dx == 0)
        dx = 0.001;
    else if (dx < -0.001)
        dx = -0.001;

    /* Is x[0] in its valid range? */
    if (x[0] <= 1.0) {
        /* The Hurwitz zeta function is infinite in this case */
        g[0] = (dx > 0) ? -huge : huge;
        return huge;
    }
    if (x[0] + dx <= 1.0) {
        g[0] = huge;
        result = x[0] * data->logsum + data->m * hsl_sf_lnhzeta(x[0], data->xmin);
    } else {
        hsl_sf_lnhzeta_deriv_tuple(x[0], data->xmin, &lnhzeta_x, &lnhzeta_deriv_x);
        g[0] = data->logsum + data->m * lnhzeta_deriv_x;
        result = x[0] * data->logsum + data->m * lnhzeta_x;
    }

#ifdef PLFIT_DEBUG
    printf(" - Gradient: %.4f\n", g[0]);
    printf(" - Result: %.4f\n", result);
#endif

    return result;
}

static int plfit_i_estimate_alpha_discrete_lbfgs_progress(void* instance,
        const lbfgsfloatval_t* x, const lbfgsfloatval_t* g,
        const lbfgsfloatval_t fx, const lbfgsfloatval_t xnorm,
        const lbfgsfloatval_t gnorm, const lbfgsfloatval_t step,
        int n, int k, int ls) {
    return 0;
}

static int plfit_i_estimate_alpha_discrete_linear_scan(double* xs, size_t n,
        double xmin, double* alpha, const plfit_discrete_options_t* options,
        plfit_bool_t sorted) {
    double curr_alpha, best_alpha, L, L_max;
    double logsum;
    size_t m;

    XMIN_CHECK_ONE;
    if (options->alpha.min <= 1.0) {
        PLFIT_ERROR("alpha.min must be greater than 1.0", PLFIT_EINVAL);
    }
    if (options->alpha.max < options->alpha.min) {
        PLFIT_ERROR("alpha.max must be greater than alpha.min", PLFIT_EINVAL);
    }
    if (options->alpha.step <= 0) {
        PLFIT_ERROR("alpha.step must be positive", PLFIT_EINVAL);
    }

    if (sorted) {
        logsum = plfit_i_logsum_discrete(xs, xs+n, xmin);
        m = n;
    } else {
        plfit_i_logsum_less_than_discrete(xs, xs+n, xmin, &logsum, &m);
    }

    best_alpha = options->alpha.min;
    L_max = -DBL_MAX;
    for (curr_alpha = options->alpha.min; curr_alpha <= options->alpha.max;
            curr_alpha += options->alpha.step) {
        L = -curr_alpha * logsum - m * hsl_sf_lnhzeta(curr_alpha, xmin);
        if (L > L_max) {
            L_max = L;
            best_alpha = curr_alpha;
        }
    }

    *alpha = best_alpha;

    return PLFIT_SUCCESS;
}

static int plfit_i_estimate_alpha_discrete_lbfgs(double* xs, size_t n, double xmin,
        double* alpha, const plfit_discrete_options_t* options, plfit_bool_t sorted) {
    lbfgs_parameter_t param;
    lbfgsfloatval_t* variables;
    plfit_i_estimate_alpha_discrete_data_t data;
    int ret;

    XMIN_CHECK_ONE;

    /* Initialize algorithm parameters */
    lbfgs_parameter_init(&param);
    param.max_iterations = 0;   /* proceed until infinity */

    /* Set up context for optimization */
    data.xmin = xmin;
    if (sorted) {
        data.logsum = plfit_i_logsum_discrete(xs, xs+n, xmin);
        data.m = n;
    } else {
        plfit_i_logsum_less_than_discrete(xs, xs+n, xmin, &data.logsum, &data.m);
    }

    /* Allocate space for the single alpha variable */
    variables = lbfgs_malloc(1);
    variables[0] = 3.0;   /* initial guess */

    /* Optimization */
    ret = lbfgs(1, variables, /* ptr_fx = */ 0,
            plfit_i_estimate_alpha_discrete_lbfgs_evaluate,
            plfit_i_estimate_alpha_discrete_lbfgs_progress, &data, &param);

    if (ret < 0 &&
        ret != LBFGSERR_ROUNDING_ERROR &&
        ret != LBFGSERR_MAXIMUMLINESEARCH &&
        ret != LBFGSERR_MINIMUMSTEP &&
        ret != LBFGSERR_CANCELED) {
        char buf[4096];
        snprintf(buf, 4096, "L-BFGS optimization signaled an error (error code = %d)", ret);
        lbfgs_free(variables);
        PLFIT_ERROR(buf, PLFIT_FAILURE);
    }
    *alpha = variables[0];

    /* Deallocate the variable array */
    lbfgs_free(variables);

    return PLFIT_SUCCESS;
}

static int plfit_i_estimate_alpha_discrete_fast(double* xs, size_t n, double xmin,
        double* alpha, const plfit_discrete_options_t* options, plfit_bool_t sorted) {
    plfit_continuous_options_t cont_options;

    if (!options)
        options = &plfit_discrete_default_options;

    plfit_continuous_options_init(&cont_options);
    cont_options.finite_size_correction = options->finite_size_correction;

    XMIN_CHECK_ONE;

    if (sorted) {
        return plfit_i_estimate_alpha_continuous_sorted(xs, n, xmin-0.5, alpha);
    } else {
        return plfit_i_estimate_alpha_continuous(xs, n, xmin-0.5, alpha);
    }
}

static int plfit_i_estimate_alpha_discrete(double* xs, size_t n, double xmin,
        double* alpha, const plfit_discrete_options_t* options,
        plfit_bool_t sorted) {
    switch (options->alpha_method) {
        case PLFIT_LBFGS:
            PLFIT_CHECK(plfit_i_estimate_alpha_discrete_lbfgs(xs, n, xmin, alpha,
                    options, sorted));
            break;

        case PLFIT_LINEAR_SCAN:
            PLFIT_CHECK(plfit_i_estimate_alpha_discrete_linear_scan(xs, n, xmin,
                    alpha, options, sorted));
            break;

        case PLFIT_PRETEND_CONTINUOUS:
            PLFIT_CHECK(plfit_i_estimate_alpha_discrete_fast(xs, n, xmin,
                    alpha, options, sorted));
            break;

        default:
            PLFIT_ERROR("unknown optimization method specified", PLFIT_EINVAL);
    }

    return PLFIT_SUCCESS;
}

static int plfit_i_ks_test_discrete(double* xs, double* xs_end, const double alpha,
        const double xmin, double* D) {
    /* Assumption: xs is sorted and cut off at xmin so the first element is
     * always larger than or equal to xmin. */
    double result = 0, n, lnhzeta, x;
    int m = 0;

    n = xs_end - xs;
    lnhzeta = hsl_sf_lnhzeta(alpha, xmin);

    while (xs < xs_end) {
        double d;

        x = *xs;

        /* Re the next line: this used to be the following:
         *
         *     fabs( 1 - hzeta(alpha, x) / hzeta(alpha, xmin) - m / n)
         *
         * However, using the Hurwitz zeta directly sometimes yields
         * underflows (see Github pull request #17 and related issues).
         * hzeta(alpha, x) / hzeta(alpha, xmin) can be replaced with
         * exp(lnhzeta(alpha, x) - lnhzeta(alpha, xmin)), but then
         * we have 1 - exp(something), which is better to calculate
         * with a dedicated expm1() function. */
        d = fabs( expm1( hsl_sf_lnhzeta(alpha, x) - lnhzeta ) + m / n);

        if (d > result)
            result = d;

        do {
            xs++; m++;
        } while (xs < xs_end && *xs == x);
    }

    *D = result;

    return PLFIT_SUCCESS;
}

static int plfit_i_calculate_p_value_discrete(double* xs, size_t n,
        const plfit_discrete_options_t* options, plfit_bool_t xmin_fixed,
        plfit_result_t *result) {
    long int num_trials;
    long int successes = 0;
    double *xs_head;
    size_t num_smaller;
    plfit_discrete_options_t options_no_p_value = *options;
    int retval = PLFIT_SUCCESS;

    if (options->p_value_method == PLFIT_P_VALUE_SKIP) {
        /* skipping p-value calculation */
        result->p = NAN;
        return PLFIT_SUCCESS;
    }

    if (options->p_value_method == PLFIT_P_VALUE_APPROXIMATE) {
        /* p-value approximation; most likely an upper bound */
        num_smaller = count_smaller(xs, xs + n, result->xmin);
        result->p = plfit_ks_test_one_sample_p(result->D, n - num_smaller);
        return PLFIT_SUCCESS;
    }

    options_no_p_value.p_value_method = PLFIT_P_VALUE_SKIP;

    num_trials = (long int)(0.25 / options->p_value_precision / options->p_value_precision);
    if (num_trials <= 0) {
        PLFIT_ERROR("invalid p-value precision", PLFIT_EINVAL);
    }

    /* Extract the head of xs that contains elements smaller than xmin */
    xs_head = extract_smaller(xs, xs+n, result->xmin, &num_smaller);
    if (xs_head == 0)
        PLFIT_ERROR("cannot calculate exact p-value", PLFIT_ENOMEM);

#ifdef _OPENMP
#pragma omp parallel
#endif
    {
        /* Parallel section starts here. If we are compiling using OpenMP, each
         * thread will use its own RNG that is seeded from the master RNG. If
         * we are compiling without OpenMP, there is only one thread and it uses
         * the master RNG. This section must be critical to ensure that only one
         * thread is using the master RNG at the same time. */
#ifdef _OPENMP
        plfit_mt_rng_t private_rng;
#endif
        plfit_mt_rng_t *p_rng;
        double *ys;
        long int i;
        plfit_result_t result_synthetic;

#ifdef _OPENMP
#pragma omp critical
        {
            p_rng = &private_rng;
            plfit_mt_init_from_rng(p_rng, options->rng);
        }
#else
        p_rng = options->rng;
#endif

        /* Allocate memory to sample into */
        ys = calloc(n, sizeof(double));
        if (ys == 0) {
            retval = PLFIT_ENOMEM;
        } else {
            /* The main for loop starts here. */
#ifdef _OPENMP
#pragma omp for reduction(+:successes)
#endif
            for (i = 0; i < num_trials; i++) {
                plfit_i_resample_discrete(xs_head, num_smaller, n, result->alpha,
                        result->xmin, n, p_rng, ys);
                if (xmin_fixed) {
                    plfit_estimate_alpha_discrete(ys, n, result->xmin,
                            &options_no_p_value, &result_synthetic);
                } else {
                    plfit_discrete(ys, n, &options_no_p_value, &result_synthetic);
                }
                if (result_synthetic.D > result->D)
                    successes++;
            }

            free(ys);
        }
        /* End of parallelized part */
    }

    free(xs_head);

    if (retval == PLFIT_SUCCESS) {
        result->p = successes / ((double)num_trials);
    } else {
        PLFIT_ERROR("cannot calculate exact p-value", retval);
    }

    return retval;
}

int plfit_log_likelihood_discrete(double* xs, size_t n, double alpha,
        double xmin, double* L) {
    double result;
    size_t m;

    if (alpha <= 1) {
        PLFIT_ERROR("alpha must be greater than one", PLFIT_EINVAL);
    }
    XMIN_CHECK_ONE;

    plfit_i_logsum_less_than_discrete(xs, xs+n, xmin, &result, &m);
    result = - alpha * result - m * hsl_sf_lnhzeta(alpha, xmin);

    *L = result;

    return PLFIT_SUCCESS;
}

int plfit_estimate_alpha_discrete(double* xs, size_t n, double xmin,
        const plfit_discrete_options_t* options, plfit_result_t *result) {
    double *xs_copy, *begin, *end;

    if (!options)
        options = &plfit_discrete_default_options;

    /* Check the validity of the input parameters */
    DATA_POINTS_CHECK;
    if (options->alpha_method == PLFIT_LINEAR_SCAN) {
        if (options->alpha.min <= 1.0) {
            PLFIT_ERROR("alpha.min must be greater than 1.0", PLFIT_EINVAL);
        }
        if (options->alpha.max < options->alpha.min) {
            PLFIT_ERROR("alpha.max must be greater than alpha.min", PLFIT_EINVAL);
        }
        if (options->alpha.step <= 0) {
            PLFIT_ERROR("alpha.step must be positive", PLFIT_EINVAL);
        }
    }

    PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
    begin = xs_copy; end = xs_copy + n;

    while (begin < end && *begin < xmin)
        begin++;

    PLFIT_CHECK(plfit_i_estimate_alpha_discrete(begin, end-begin, xmin,
            &result->alpha, options, /* sorted = */ 1));
    PLFIT_CHECK(plfit_i_ks_test_discrete(begin, end, result->alpha, xmin, &result->D));

    result->xmin = xmin;

    if (options->finite_size_correction)
        plfit_i_perform_finite_size_correction(result, end-begin);

    PLFIT_CHECK(plfit_log_likelihood_discrete(begin, end-begin, result->alpha,
            result->xmin, &result->L));
    PLFIT_CHECK(plfit_i_calculate_p_value_discrete(xs, n, options, 1, result));

    free(xs_copy);

    return PLFIT_SUCCESS;
}

int plfit_discrete(double* xs, size_t n, const plfit_discrete_options_t* options,
        plfit_result_t* result) {
    double curr_D, curr_alpha;
    plfit_result_t best_result;
    double *xs_copy, *px, *end, *end_xmin, prev_x;
    size_t best_n;
    int m;

    if (!options)
        options = &plfit_discrete_default_options;

    /* Check the validity of the input parameters */
    DATA_POINTS_CHECK;
    if (options->alpha_method == PLFIT_LINEAR_SCAN) {
        if (options->alpha.min <= 1.0) {
            PLFIT_ERROR("alpha.min must be greater than 1.0", PLFIT_EINVAL);
        }
        if (options->alpha.max < options->alpha.min) {
            PLFIT_ERROR("alpha.max must be greater than alpha.min", PLFIT_EINVAL);
        }
        if (options->alpha.step <= 0) {
            PLFIT_ERROR("alpha.step must be positive", PLFIT_EINVAL);
        }
    }

    PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
    best_result.D = DBL_MAX;
    best_result.xmin = 1;
    best_result.alpha = 1;
    best_n = 0;

    /* Skip initial values from xs_copy until we get to a positive element or
     * until we reach the end of the array */
    px = xs_copy; end = px + n; end_xmin = end - 1;
    while (px < end && *px < 1) {
        px++;
    }

    /* Make sure there are at least three distinct values if possible */
    m = px - xs_copy;
    prev_x = *end_xmin;
    while (end_xmin > px && *end_xmin == prev_x) {
        end_xmin--;
    }
    prev_x = *end_xmin;
    while (end_xmin > px && *end_xmin == prev_x) {
        end_xmin--;
    }

    prev_x = 0;
    end_xmin++;
    while (px < end_xmin) {
        while (px < end_xmin && *px == prev_x) {
            px++; m++;
        }

        PLFIT_CHECK(
            plfit_i_estimate_alpha_discrete(
                px, n-m, *px, &curr_alpha, options, /* sorted = */ 1
            )
        );
        PLFIT_CHECK(
            plfit_i_ks_test_discrete(px, end, curr_alpha, *px, &curr_D)
        );

        if (curr_D < best_result.D) {
            best_result.alpha = curr_alpha;
            best_result.xmin = *px;
            best_result.D = curr_D;
            best_n = n-m;
        }

        prev_x = *px;
        px++; m++;
    }

    *result = best_result;
    if (options->finite_size_correction)
        plfit_i_perform_finite_size_correction(result, best_n);

    PLFIT_CHECK(plfit_log_likelihood_discrete(xs_copy+(n-best_n), best_n,
            result->alpha, result->xmin, &result->L));
    PLFIT_CHECK(plfit_i_calculate_p_value_discrete(xs_copy, n, options, 0, result));

    free(xs_copy);

    return PLFIT_SUCCESS;
}

/***** resampling routines to generate synthetic replicates ****/

static int plfit_i_resample_continuous(double* xs_head, size_t num_smaller,
        size_t n, double alpha, double xmin, size_t num_samples,
        plfit_mt_rng_t* rng, double* result) {
    size_t num_orig_samples, i;

    /* Calculate how many samples have to be drawn from xs_head */
    num_orig_samples = (size_t) plfit_rbinom(num_samples, num_smaller / (double)n, rng);

    /* Draw the samples from xs_head */
    for (i = 0; i < num_orig_samples; i++, result++) {
        *result = xs_head[(size_t)plfit_runif(0, num_smaller, rng)];
    }

    /* Draw the remaining samples from the fitted distribution */
    PLFIT_CHECK(plfit_rpareto_array(xmin, alpha-1, num_samples-num_orig_samples,
            rng, result));

    return PLFIT_SUCCESS;
}

int plfit_resample_continuous(double* xs, size_t n, double alpha, double xmin,
        size_t num_samples, plfit_mt_rng_t* rng, double* result) {
    double *xs_head;
    size_t num_smaller = 0;
    int retval;

    /* Extract the head of xs that contains elements smaller than xmin */
    xs_head = extract_smaller(xs, xs+n, xmin, &num_smaller);
    if (xs_head == 0)
        PLFIT_ERROR("cannot resample continuous dataset", PLFIT_ENOMEM);

    retval = plfit_i_resample_continuous(xs_head, num_smaller, n, alpha, xmin,
            num_samples, rng, result);

    /* Free xs_head; we don't need it any more */
    free(xs_head);

    return retval;
}

static int plfit_i_resample_discrete(double* xs_head, size_t num_smaller, size_t n,
        double alpha, double xmin, size_t num_samples, plfit_mt_rng_t* rng,
        double* result) {
    size_t num_orig_samples, i;

    /* Calculate how many samples have to be drawn from xs_head */
    num_orig_samples = (size_t) plfit_rbinom(num_samples, num_smaller / (double)n, rng);

    /* Draw the samples from xs_head */
    for (i = 0; i < num_orig_samples; i++, result++) {
        *result = xs_head[(size_t)plfit_runif(0, num_smaller, rng)];
    }

    /* Draw the remaining samples from the fitted distribution */
    PLFIT_CHECK(plfit_rzeta_array((long int)xmin, alpha, num_samples-num_orig_samples,
            rng, result));

    return PLFIT_SUCCESS;
}

int plfit_resample_discrete(double* xs, size_t n, double alpha, double xmin,
        size_t num_samples, plfit_mt_rng_t* rng, double* result) {
    double *xs_head;
    size_t num_smaller = 0;
    int retval;

    /* Extract the head of xs that contains elements smaller than xmin */
    xs_head = extract_smaller(xs, xs+n, xmin, &num_smaller);
    if (xs_head == 0)
        PLFIT_ERROR("cannot resample discrete dataset", PLFIT_ENOMEM);

    retval = plfit_i_resample_discrete(xs_head, num_smaller, n, alpha, xmin,
            num_samples, rng, result);

    /* Free xs_head; we don't need it any more */
    free(xs_head);

    return retval;
}

/******** calculating the p-value of a fitted model only *******/

int plfit_calculate_p_value_continuous(double* xs, size_t n,
        const plfit_continuous_options_t* options, plfit_bool_t xmin_fixed,
        plfit_result_t *result) {
    double* xs_copy;

    PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
    PLFIT_CHECK(plfit_i_calculate_p_value_continuous(xs_copy, n, options,
            xmin_fixed, result));
    free(xs_copy);

    return PLFIT_SUCCESS;
}

int plfit_calculate_p_value_discrete(double* xs, size_t n,
        const plfit_discrete_options_t* options, plfit_bool_t xmin_fixed,
        plfit_result_t *result) {
    double* xs_copy;

    PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
    PLFIT_CHECK(plfit_i_calculate_p_value_discrete(xs_copy, n, options,
            xmin_fixed, result));
    free(xs_copy);

    return PLFIT_SUCCESS;
}
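
/*
 * Illustrative usage sketch (not part of the library). It relies only on the
 * public entry points used in this file -- plfit_continuous_options_init(),
 * plfit_continuous() and plfit_result_t -- with made-up sample data; whether
 * a p-value is computed depends on the defaults set by
 * plfit_continuous_options_init(), so only alpha, xmin and D are printed.
 * The discrete case is analogous via plfit_discrete_options_init() and
 * plfit_discrete().
 *
 *     #include <stdio.h>
 *     #include "plfit.h"
 *
 *     int main(void) {
 *         double xs[] = { 1.1, 1.5, 2.3, 3.1, 4.8, 7.2, 11.0, 17.5, 26.0, 41.3 };
 *         plfit_continuous_options_t options;
 *         plfit_result_t result;
 *
 *         plfit_continuous_options_init(&options);
 *         if (plfit_continuous(xs, 10, &options, &result) == PLFIT_SUCCESS) {
 *             printf("alpha = %g, xmin = %g, D = %g\n",
 *                    result.alpha, result.xmin, result.D);
 *         }
 *         return 0;
 *     }
 */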