/*------------------------------------------------------------------------- * * Copyright (c) 2018-2021, Darafei Praliaskouski * Copyright (c) 2016, Paul Ramsey * *------------------------------------------------------------------------*/ #include "liblwgeom_internal.h" /* * When clustering lists with NULL or EMPTY elements, they will get this as * their cluster number. (All the other clusters will be non-negative) */ #define KMEANS_NULL_CLUSTER -1 /* * If the algorithm doesn't converge within this number of iterations, * it will return with a failure error code. */ #define KMEANS_MAX_ITERATIONS 1000 static uint32_t kmeans(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t min_k, double max_radius); inline static double distance3d_sqr_pt4d_pt4d(const POINT4D *p1, const POINT4D *p2) { double hside = p2->x - p1->x; double vside = p2->y - p1->y; double zside = p2->z - p1->z; return hside * hside + vside * vside + zside * zside; } /* Split the clusters that need to be split */ static uint32_t improve_structure(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t k, double max_radius) { /* Input check: radius limit should be measurable */ if (max_radius <= 0) return k; double max_radius_sq = max_radius * max_radius; /* Do we have the big clusters to split at all? */ uint32_t first_cluster_to_split = 0; for (; first_cluster_to_split < k; first_cluster_to_split++) if (radii[first_cluster_to_split] > max_radius_sq) break; if (first_cluster_to_split == k) return k; POINT4D *temp_objs = lwalloc(sizeof(POINT4D) * n); uint32_t *temp_clusters = lwalloc(sizeof(uint32_t) * n); double *temp_radii = lwalloc(sizeof(double) * n); POINT4D *temp_centers = lwalloc(sizeof(POINT4D) * n); uint32_t new_k = k; for (uint32_t cluster = first_cluster_to_split; cluster < k; cluster++) { if (radii[cluster] <= max_radius_sq) continue; /* copy cluster alone */ uint32_t cluster_size = 0; for (uint32_t i = 0; i < n; i++) if (clusters[i] == cluster) temp_objs[cluster_size++] = objs[i]; if (cluster_size <= 1) continue; /* run 2-means on the cluster */ kmeans(temp_objs, temp_clusters, cluster_size, temp_centers, temp_radii, 2, 0); /* replace cluster with split */ uint32_t d = 0; for (uint32_t i = 0; i < n; i++) if (clusters[i] == cluster) if (temp_clusters[d++]) clusters[i] = new_k; centers[cluster] = temp_centers[0]; centers[new_k] = temp_centers[1]; radii[cluster] = temp_radii[0]; radii[new_k] = temp_radii[1]; new_k++; } lwfree(temp_centers); lwfree(temp_radii); lwfree(temp_clusters); lwfree(temp_objs); return new_k; } /* Refresh mapping of point to closest cluster */ static uint8_t update_r(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t k) { uint8_t converged = LW_TRUE; if (radii) memset(radii, 0, sizeof(double) * k); for (uint32_t i = 0; i < n; i++) { POINT4D obj = objs[i]; /* Initialize with distance to first cluster */ double curr_distance = distance3d_sqr_pt4d_pt4d(&obj, ¢ers[0]); uint32_t curr_cluster = 0; /* Check all other cluster centers and find the nearest */ for (uint32_t cluster = 1; cluster < k; cluster++) { double distance = distance3d_sqr_pt4d_pt4d(&obj, ¢ers[cluster]); if (distance < curr_distance) { curr_distance = distance; curr_cluster = cluster; } } /* Store the nearest cluster this object is in */ if (clusters[i] != curr_cluster) { converged = LW_FALSE; clusters[i] = curr_cluster; } if (radii) if (radii[curr_cluster] < curr_distance) radii[curr_cluster] = curr_distance; } return converged; } /* Refresh cluster centroids based on all of their objects */ static void update_means(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, uint32_t k) { memset(centers, 0, sizeof(POINT4D) * k); /* calculate weighted sum */ for (uint32_t i = 0; i < n; i++) { uint32_t cluster = clusters[i]; centers[cluster].x += objs[i].x * objs[i].m; centers[cluster].y += objs[i].y * objs[i].m; centers[cluster].z += objs[i].z * objs[i].m; centers[cluster].m += objs[i].m; } /* divide by weight to get average */ for (uint32_t i = 0; i < k; i++) { if (centers[i].m) { centers[i].x /= centers[i].m; centers[i].y /= centers[i].m; centers[i].z /= centers[i].m; } } } /* Assign initial clusters centroids heuristically */ static void kmeans_init(POINT4D *objs, uint32_t n, POINT4D *centers, uint32_t k) { double *distances; uint32_t p1 = 0, p2 = 0; uint32_t duplicate_count = 1; /* a point is a duplicate of itself */ double max_dst = -1; /* k=0, k=1: any point will do */ assert(n > 0); if (k < 2) { centers[0] = objs[0]; return; } /* k >= 2: find two distant points greedily */ for (uint32_t i = 1; i < n; i++) { /* if we found a larger distance, replace our choice */ double dst_p1 = distance3d_sqr_pt4d_pt4d(&objs[i], &objs[p1]); double dst_p2 = distance3d_sqr_pt4d_pt4d(&objs[i], &objs[p2]); if ((dst_p1 > max_dst) || (dst_p2 > max_dst)) { if (dst_p1 > dst_p2) { max_dst = dst_p1; p2 = i; } else { max_dst = dst_p2; p1 = i; } } if ((dst_p1 == 0) || (dst_p2 == 0)) duplicate_count++; } if (duplicate_count > 1) lwnotice( "%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested", __func__, duplicate_count); /* by now two points should be found and non-same */ assert(p1 != p2 && max_dst >= 0); /* accept these two points */ centers[0] = objs[p1]; centers[1] = objs[p2]; if (k > 2) { /* array of minimum distance to a point from accepted cluster centers */ distances = lwalloc(sizeof(double) * n); /* initialize array with distance to first object */ for (uint32_t j = 0; j < n; j++) distances[j] = distance3d_sqr_pt4d_pt4d(&objs[j], ¢ers[0]); distances[p1] = -1; distances[p2] = -1; /* loop i on clusters, skip 0 and 1 as found already */ for (uint32_t i = 2; i < k; i++) { uint32_t candidate_center = 0; double max_distance = -DBL_MAX; /* loop j on objs */ for (uint32_t j = 0; j < n; j++) { /* empty objs and accepted clusters are already marked with distance = -1 */ if (distances[j] < 0) continue; /* update minimal distance with previosuly accepted cluster */ double current_distance = distance3d_sqr_pt4d_pt4d(&objs[j], ¢ers[i - 1]); if (current_distance < distances[j]) distances[j] = current_distance; /* greedily take a point that's farthest from any of accepted clusters */ if (distances[j] > max_distance) { candidate_center = j; max_distance = distances[j]; } } /* Checked earlier by counting entries on input, just in case */ assert(max_distance >= 0); /* accept candidate to centers */ distances[candidate_center] = -1; /* Copy the point coordinates into the initial centers array * Centers array is an array of pointers to points, not an array of points */ centers[i] = objs[candidate_center]; } lwfree(distances); } } static uint32_t kmeans(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t min_k, double max_radius) { uint8_t converged = LW_FALSE; uint32_t cur_k = min_k; kmeans_init(objs, n, centers, cur_k); /* One iteration of kmeans needs to happen without shortcuts to fully initialize structures */ update_r(objs, clusters, n, centers, radii, cur_k); update_means(objs, clusters, n, centers, cur_k); for (uint32_t t = 0; t < KMEANS_MAX_ITERATIONS; t++) { /* Standard KMeans loop */ for (uint32_t i = 0; i < KMEANS_MAX_ITERATIONS; i++) { LW_ON_INTERRUPT(break); converged = update_r(objs, clusters, n, centers, radii, cur_k); if (converged) break; update_means(objs, clusters, n, centers, cur_k); } if (!converged || !max_radius) break; /* XMeans-inspired improve_structure pass to split clusters bigger than limit into 2 */ uint32_t new_k = improve_structure(objs, clusters, n, centers, radii, cur_k, max_radius); if (new_k == cur_k) break; cur_k = new_k; } if (!converged) { lwerror("%s did not converge after %d iterations", __func__, KMEANS_MAX_ITERATIONS); return 0; } return cur_k; } int * lwgeom_cluster_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k, double max_radius) { uint32_t num_non_empty = 0; assert(k > 0); assert(n > 0); assert(max_radius >= 0); assert(geoms); if (n < k) { lwerror( "%s: number of geometries is less than the number of clusters requested, not all clusters will get data", __func__); k = n; } /* An array of objects to be analyzed. */ POINT4D *objs_dense = lwalloc(sizeof(POINT4D) * n); /* Array to mark unclusterable objects. Will be returned as KMEANS_NULL_CLUSTER. */ uint8_t *geom_valid = lwalloc(sizeof(uint8_t) * n); memset(geom_valid, 0, sizeof(uint8_t) * n); /* Array to fill in with cluster numbers. */ int *clusters = lwalloc(sizeof(int) * n); for (uint32_t i = 0; i < n; i++) clusters[i] = KMEANS_NULL_CLUSTER; /* An array of clusters centers for the algorithm. */ POINT4D *centers = lwalloc(sizeof(POINT4D) * n); memset(centers, 0, sizeof(POINT4D) * n); /* An array of clusters radii for the algorithm. */ double *radii = lwalloc(sizeof(double) * n); memset(radii, 0, sizeof(double) * n); /* Prepare the list of object pointers for K-means */ for (uint32_t i = 0; i < n; i++) { const LWGEOM *geom = geoms[i]; /* Unset M values will be 1 */ POINT4D out = {0, 0, 0, 1}; /* Null/empty geometries get geom_valid=LW_FALSE set earlier with memset */ if ((!geom) || lwgeom_is_empty(geom)) continue; /* If the input is a point, use its coordinates */ if (lwgeom_get_type(geom) == POINTTYPE) { out.x = lwpoint_get_x(lwgeom_as_lwpoint(geom)); out.y = lwpoint_get_y(lwgeom_as_lwpoint(geom)); if (lwgeom_has_z(geom)) out.z = lwpoint_get_z(lwgeom_as_lwpoint(geom)); if (lwgeom_has_m(geom)) { out.m = lwpoint_get_m(lwgeom_as_lwpoint(geom)); if (out.m <= 0) lwerror("%s has an input point geometry with weight in M less or equal to 0", __func__); } } else if (!lwgeom_has_z(geom)) { /* For 2D, we can take a centroid */ LWGEOM *centroid = lwgeom_centroid(geom); if (!centroid) continue; if (lwgeom_is_empty(centroid)) { lwgeom_free(centroid); continue; } out.x = lwpoint_get_x(lwgeom_as_lwpoint(centroid)); out.y = lwpoint_get_y(lwgeom_as_lwpoint(centroid)); lwgeom_free(centroid); } else { /* For 3D non-point, we can have a box center */ const GBOX *box = lwgeom_get_bbox(geom); if (!gbox_is_valid(box)) continue; out.x = (box->xmax + box->xmin) / 2; out.y = (box->ymax + box->ymin) / 2; out.z = (box->zmax + box->zmin) / 2; } geom_valid[i] = LW_TRUE; objs_dense[num_non_empty++] = out; } if (num_non_empty < k) { lwnotice( "%s: number of non-empty geometries (%d) is less than the number of clusters (%d) requested, not all clusters will get data", __func__, num_non_empty, k); k = num_non_empty; } uint8_t converged = LW_TRUE; if (num_non_empty > 0) { uint32_t *clusters_dense = lwalloc(sizeof(uint32_t) * num_non_empty); memset(clusters_dense, 0, sizeof(uint32_t) * num_non_empty); uint32_t output_cluster_count = kmeans(objs_dense, clusters_dense, num_non_empty, centers, radii, k, max_radius); uint32_t d = 0; for (uint32_t i = 0; i < n; i++) if (geom_valid[i]) clusters[i] = (int)clusters_dense[d++]; converged = output_cluster_count > 0; lwfree(clusters_dense); } /* Before error handling, might as well clean up all the inputs */ lwfree(objs_dense); lwfree(centers); lwfree(geom_valid); lwfree(radii); /* Good result */ if (converged) return clusters; /* Bad result, not going to need the answer */ lwfree(clusters); return NULL; }