/*! * \file * \brief Frequent/Closed itemset discovery routines * * This file contains the code for finding frequent/closed itemests. These routines * are implemented using a call-back mechanism to deal with the discovered itemsets. * * \date 6/13/2008 * \author George Karypis * \version\verbatim $Id: itemsets.c 11075 2011-11-11 22:31:52Z karypis $ \endverbatim */ #include "GKlib.h" /*-------------------------------------------------------------*/ /*! Data structures for use within this module */ /*-------------------------------------------------------------*/ typedef struct { int minfreq; /* the minimum frequency of a pattern */ int maxfreq; /* the maximum frequency of a pattern */ int minlen; /* the minimum length of the requested pattern */ int maxlen; /* the maximum length of the requested pattern */ int tnitems; /* the initial range of the item space */ /* the call-back function */ void (*callback)(void *stateptr, int nitems, int *itemids, int ntrans, int *transids); void *stateptr; /* the user-supplied pointer to pass to the callback */ /* workspace variables */ int *rmarker; gk_ikv_t *cand; } isparams_t; /*-------------------------------------------------------------*/ /*! Prototypes for this module */ /*-------------------------------------------------------------*/ void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat, int preflen, int *prefix); gk_csr_t *itemsets_project_matrix(isparams_t *param, gk_csr_t *mat, int cid); /*************************************************************************/ /*! The entry point of the frequent itemset discovery code */ /*************************************************************************/ void gk_find_frequent_itemsets(int ntrans, ssize_t *tranptr, int *tranind, int minfreq, int maxfreq, int minlen, int maxlen, void (*process_itemset)(void *stateptr, int nitems, int *itemids, int ntrans, int *transids), void *stateptr) { ssize_t i; gk_csr_t *mat, *pmat; isparams_t params; int *pattern; /* Create the matrix */ mat = gk_csr_Create(); mat->nrows = ntrans; mat->ncols = tranind[gk_iargmax(tranptr[ntrans], tranind)]+1; mat->rowptr = gk_zcopy(ntrans+1, tranptr, gk_zmalloc(ntrans+1, "gk_find_frequent_itemsets: mat.rowptr")); mat->rowind = gk_icopy(tranptr[ntrans], tranind, gk_imalloc(tranptr[ntrans], "gk_find_frequent_itemsets: mat.rowind")); mat->colids = gk_iincset(mat->ncols, 0, gk_imalloc(mat->ncols, "gk_find_frequent_itemsets: mat.colids")); /* Setup the parameters */ params.minfreq = minfreq; params.maxfreq = (maxfreq == -1 ? mat->nrows : maxfreq); params.minlen = minlen; params.maxlen = (maxlen == -1 ? mat->ncols : maxlen); params.tnitems = mat->ncols; params.callback = process_itemset; params.stateptr = stateptr; params.rmarker = gk_ismalloc(mat->nrows, 0, "gk_find_frequent_itemsets: rmarker"); params.cand = gk_ikvmalloc(mat->ncols, "gk_find_frequent_itemsets: cand"); /* Perform the initial projection */ gk_csr_CreateIndex(mat, GK_CSR_COL); pmat = itemsets_project_matrix(¶ms, mat, -1); gk_csr_Free(&mat); pattern = gk_imalloc(pmat->ncols, "gk_find_frequent_itemsets: pattern"); itemsets_find_frequent_itemsets(¶ms, pmat, 0, pattern); gk_csr_Free(&pmat); gk_free((void **)&pattern, ¶ms.rmarker, ¶ms.cand, LTERM); } /*************************************************************************/ /*! The recursive routine for DFS-based frequent pattern discovery */ /*************************************************************************/ void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat, int preflen, int *prefix) { ssize_t i; gk_csr_t *cmat; /* Project each frequent column */ for (i=0; incols; i++) { prefix[preflen] = mat->colids[i]; if (preflen+1 >= params->minlen) (*params->callback)(params->stateptr, preflen+1, prefix, mat->colptr[i+1]-mat->colptr[i], mat->colind+mat->colptr[i]); if (preflen+1 < params->maxlen) { cmat = itemsets_project_matrix(params, mat, i); itemsets_find_frequent_itemsets(params, cmat, preflen+1, prefix); gk_csr_Free(&cmat); } } } /******************************************************************************/ /*! This function projects a matrix w.r.t. to a particular column. It performs the following steps: - Determines the length of each column that is remaining - Sorts the columns in increasing length - Creates a column-based version of the matrix with the proper column ordering and renamed rowids. */ /*******************************************************************************/ gk_csr_t *itemsets_project_matrix(isparams_t *params, gk_csr_t *mat, int cid) { ssize_t i, j, k, ii, pnnz; int nrows, ncols, pnrows, pncols; ssize_t *colptr, *pcolptr; int *colind, *colids, *pcolind, *pcolids, *rmarker; gk_csr_t *pmat; gk_ikv_t *cand; nrows = mat->nrows; ncols = mat->ncols; colptr = mat->colptr; colind = mat->colind; colids = mat->colids; rmarker = params->rmarker; cand = params->cand; /* Allocate space for the projected matrix based on what you know thus far */ pmat = gk_csr_Create(); pmat->nrows = pnrows = (cid == -1 ? nrows : colptr[cid+1]-colptr[cid]); /* Mark the rows that will be kept and determine the prowids */ if (cid == -1) { /* Initial projection */ gk_iset(nrows, 1, rmarker); } else { /* The other projections */ for (i=colptr[cid]; i= params->minfreq && k <= params->maxfreq) { cand[pncols].val = i; cand[pncols++].key = k; pnnz += k; } } /* Sort the columns in increasing order */ gk_ikvsorti(pncols, cand); /* Allocate space for the remaining fields of the projected matrix */ pmat->ncols = pncols; pmat->colids = pcolids = gk_imalloc(pncols, "itemsets_project_matrix: pcolids"); pmat->colptr = pcolptr = gk_zmalloc(pncols+1, "itemsets_project_matrix: pcolptr"); pmat->colind = pcolind = gk_imalloc(pnnz, "itemsets_project_matrix: pcolind"); /* Populate the projected matrix */ pcolptr[0] = 0; for (pnnz=0, ii=0; ii