/** * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the MIT License. */ #ifndef _FA_FSM_CONST_H_ #define _FA_FSM_CONST_H_ #include "FAConfig.h" class FAFsmConst { public: // type of data structure enum { TYPE_RS_NFA = 0, TYPE_POS_RS_NFA, TYPE_RS_DFA, TYPE_MOORE_DFA, TYPE_MOORE_MULTI_DFA, TYPE_MULTI_MAP, TYPE_MEALY_NFA, TYPE_MEALY_DFA, TYPE_ARRAY, TYPE_FLOAT_ARRAY, TYPE_COUNT, }; // automaton or mmap container type enum { MODE_INT = 0, // general int-based container MODE_PACK_TRIV, // trivial packed container MODE_PACK_MPH, // MPH-based packed container MODE_PACK_FIXED, // fixed size array based representation MODE_COUNT, }; // modes of reaction map in-memory representations enum { OWS_MODE_TRIV = 0, OWS_MODE_UNIQ, OWS_MODE_DUMP, OWS_MODE_COUNT, }; // types of on-disk representations enum { FORMAT_TXT = 0, // ASCII text FORMAT_DUMP, // contiguous memory dump FORMAT_COUNT, }; // specifies different types of labels of regular expressions enum { LABEL_DIGIT = 0, // uint-based regular expressions LABEL_CHAR, // character based regular expressions LABEL_WRE, // WRE-token based regular expressions LABEL_COUNT, }; // default values for some common input weights enum { IW_ANY = 0, // /./ IW_L_ANCHOR = 1, // /^/ IW_R_ANCHOR = 2, // /$/ IW_EPSILON = 3, // epsilon IW_EOS = 4, // end-of-sequence IW_COUNT, }; // one for all Dead-State value enum { NFA_DEAD_STATE = -1, DFA_DEAD_STATE = -2, // as -1 indicates the absence of transition }; /// types of digitizers enum { DIGITIZER_TEXT = 0, DIGITIZER_TAGS, DIGITIZER_DCTS, DIGITIZER_COUNT, }; // WRE token type bit-masks enum { WRE_TT_TEXT = 1, WRE_TT_TAGS = 2, WRE_TT_DCTS = 4, WRE_TT_DEFAULT = WRE_TT_TEXT | WRE_TT_TAGS | WRE_TT_DCTS, }; // types of WRE rules enum { WRE_TYPE_RS = 0, // accepting rules WRE_TYPE_MOORE, // classifying rules WRE_TYPE_MEALY, // extracting rules WRE_TYPE_COUNT, }; // packed WRE configuration constants enum { WRE_CONF_WRE_TYPE = 0, // WRE-type index WRE_CONF_TOKEN_TYPE, // token-type index WRE_CONF_TAG_OW_BASE, // ... WRE_CONF_TXT_DIG, // Txt digitizer offset index WRE_CONF_DCT_DIG, // Dxt digitizer offset index WRE_CONF_FSM1, // ... WRE_CONF_FSM2, WRE_CONF_TRBR, WRE_CONF_COUNT, // configuration size }; // types of input chain procesing directions enum { DIR_L2R = 0, // left to right DIR_R2L, // right to left DIR_AFF, // affix: last, first, last - 1, first + 1, ... DIR_COUNT, }; // transformation types enum { TR_HYPH_REDUP = 0, // hyphenated reduplication TR_HYPH_REDUP_REV, // reversed hyphenated reduplication TR_PREFIX, // puts recognized prefixes to the end of the word TR_PREFIX_REV, // reverse prefix transformation TR_UCF, // upper-case-first transformation TR_UCF_REV, // reverse upper-case-first transformation TR_COUNT, }; // types of automaton interpreters enum { INT_TRIV = 0, // trivial INT_FNFA, // factor NFA based INT_POS, // match positions reconstruction INT_TRBR, // triangular bracket extraction INT_WG, // word-guesser INT_SUBST, // substituter INT_SUFFIX, // suffix rules interpreter INT_COUNT, }; // kinds of functions enum { FUNC_W2T = 0, // returns tags for the given word-form FUNC_W2B, // returns base-forms from a word-form FUNC_B2W, // returns word-forms from a base-form FUNC_W2W, // returns word-forms from a word-form FUNC_TRS, // transformation of FATransformCA_t < Ty > type FUNC_W2S, // makes segmentation of the input word FUNC_WRE, // WRE syntax rules FUNC_WT2B, // returns base-forms for the word with the given tag FUNC_B2WT, // returns word-forms with the given tag from the base FUNC_WTT2W, // a superposition of FUNC_B2WT (FUNC_WT2B (w)) FUNC_TAG_DICT, // tag-dictionary function FUNC_W2H, // word hyphenation function FUNC_POS_DICT, // POS tagging (Tag/Prob) dictionary function FUNC_B2T, // returns all word-form tags for the given base-form FUNC_T2TB, // for the given tag returns the base-form tag(s) FUNC_TB2T, // for the given base-form tag returns word-form tags FUNC_W2TP, // by the given word returns tags and p(t|w) values FUNC_W2TPL, // by the following word returns tags and p(t|w-1) values FUNC_W2TPR, // by the next word returns tags and p(t|w+1) values FUNC_WBD, // builds rules and actions for word boundary detection FUNC_GLOBAL, // global client specific configuration FUNC_W2H_ALT, // additional alternative word hyphenation function FUNC_T2P, // returns a probability of a tag, P(T) FUNC_TT2P, // returns a probability of a tag, P(T|T-1) FUNC_TTT2P, // returns a probability of a tag, P(T|T-2,T-1) FUNC_NORM_RULES,// concatenation rules for NE normalization FUNC_NORM_DICT,// normalization dictionary FUNC_EMIT, // NE emission rules FUNC_OIC_RULES,// offensive in context WRE rules FUNC_CSS_RULES,// CSS suggestion WRE rules FUNC_W2V, // returns spelling variants for a word-form FUNC_W2P, // returns word's probability p(w) FUNC_N2TP, // by the given ngram returns tags and p(t|ng) values FUNC_LAD, // Language Auto-Detection (LAD) FUNC_U2L, // returns a set of languages and possibly scores for the given url FUNC_COUNT, }; // parameter names for run-time containers enum { PARAM_IN_TR = 0, // Input transformation type PARAM_OUT_TR, // Output transformation type PARAM_FSM, // input automaton (type is clear from context) PARAM_RSDFA, // RS DFA PARAM_STATE2OW, // Reaction PARAM_STATE2OWS, // Multiple reaction PARAM_ACTS, // Rule -> action map PARAM_FOLLOW, // Following positions map PARAM_POS2BEGINBR, // Position -> Begining Bracket map PARAM_POS2ENDBR, // Position -> Ending Bracket map PARAM_REVERSE, // Reverse input PARAM_DIRECTION, // Input direction PARAM_TRIM, // Trimming value for word-guesser PARAM_REDUP_DELIM, // Reduplication delimiter PARAM_PREF_DELIM, // Prefix delimiter PARAM_PREF_FSM, // Prefix automaton PARAM_MAP_MODE, // Container type for the action or some other map PARAM_MIN_LEN, // smallest length, e.g. compound, pattern, etc. PARAM_NO_TR, // Forbid transformation, even if it is specified PARAM_UCF_DELIM, // upper-case-first transformation delimiter PARAM_TOKEN_TYPE, // WRE token type PARAM_FSM_COUNT, // FSM count per rule PARAM_IGNORE_CASE, // process input in the lower case PARAM_DEFAULT_TAG, // default tag value PARAM_ARRAY, // Packed Array, for example: word id -> set id PARAM_MULTI_MAP, // Multi-Map, for example: set id -> set PARAM_FSM_TYPE, // specifies the type of FSM (if needed) PARAM_LEFT_ANCHOR, // special left Iw PARAM_RIGHT_ANCHOR,// special right Iw PARAM_TYPE, // structure type (WRE type for example) PARAM_TAG_OW_BASE, // base value for the tag Ow (WRE specific) PARAM_DICT_MODE, // classifier is working as a dictionary PARAM_MIN_LEN2, // other than PARAM_MIN_LEN minimal length PARAM_MIN_LEN3, // other than PARAM_MIN_LEN(2) minimal length PARAM_HYPH_TYPE, // defines hyphenation algorithm to be used PARAM_NORMALIZE, // inidicates whether normalization should be used PARAM_MAX_PROB, // the numrical value corresponding to 1 of the prob PARAM_DO_W2B, // do reductive stemming, e.g. at word-breaking PARAM_DEPTH, // depth, e.g. recursion depth ... PARAM_MAX_TAG, // maximum tag value PARAM_LOG_SCALE, // the log scale is being used PARAM_FLOAT_ARRAY, // array of floating point values PARAM_WORD, // word token tag PARAM_PUNKT, // punktuation tags PARAM_EOS, // end of sequence tag PARAM_EOP, // end of paragraph tag PARAM_USE_NFST, // indicates that NFST should be used PARAM_CHARMAP, // character map PARAM_WRE_CONF, // compiled WRE configuration dump PARAM_SUFFIX_FSM, // suffix automaton PARAM_MIN_UNI_PROB,// minimal unigram probability in % PARAM_XWORD, // complex token tag PARAM_SEG, // segment tag PARAM_IGNORE, // ignore tag PARAM_ORDER, // n-grams order PARAM_MIN_ORDER, // n-grams min backoff order PARAM_UNKNOWN, // UNKNOWN tag PARAM_MAX_COUNT, // maximum count value PARAM_RATIO, // ratio in % PARAM_RATIO2, // ratio in % PARAM_C2S_MAP, // character --> script map PARAM_S2L_MAP, // script --> language map PARAM_SCRIPT_MIN, // smallest tag value for scripts PARAM_SCRIPT_MAX, // biggest tag value for scripts PARAM_MAX_DISTANCE,// maximum distance value PARAM_MAX_PASS_COUNT,// maximum pass count PARAM_MAX_SCORE, // override score PARAM_THRESHOLD, // any threshold PARAM_ACT_DATA, // action data map PARAM_MAX_LENGTH, // maximum length, e.g. maximum token length PARAM_VERIFY_LDB_BIN, // if specified, requires a CRC32-like check for the LDB file to pass PARAM_TOKENIZATION_TYPE, // specifies which tokenization runtime should be used PARAM_COUNT, }; // parser type enum { PARSER_TRIV = 0, // refers to FAParser_triv_t interpreter PARSER_NEST = 1, // refers to FAParser_nest_t interpreter PARSER_WRE_LEX = 2,// fa_lex-like WRE rules }; // parser type enum { RESOLVE_MATCH_ALL = 0, // accepts all match results RESOLVE_MATCH_NEST = 1, // removes overlappings and same rule nested }; // length 2 trbr mapping enum { TRBR_LEFT = -1, // indicates left triangular bracket TRBR_RIGHT = -2, // indicates right triangular bracket }; // special characters for corpus IO enum { CHAR_WORD_DELIM = ' ', // words delimiter for corpus tools CHAR_TAG_DELIM = '/', // word/tag delimiter for tagged word CHAR_MWE_DELIM = '_', // simple words delimiter within MWE CHAR_SPACE = ' ', // normal space for MWEs }; // dictionary data mode type enum { DM_RAW = 0, // KEY -> RAW DATA ; no duplicate words DM_TAGS, // KEY -> TAGS ; input sorted by KEY DM_TAG_PROB, // KEY -> TAG, PROB ; input sorted by KEY DM_HYPH, // KEY -> FREQ, OWS ; input sorted by KEY DM_COUNT, }; // statistics type masks enum { STAT_TYPE_NONE = 0, // nothing STAT_TYPE_W = 1, // the word STAT_TYPE_WT = 2, // the word and tag STAT_TYPE_WTT = 4, // the word, tag and the following tag STAT_TYPE_TWT = 8, // the word, tag and the preceding tag STAT_TYPE_WTWT = 16, // the word, tag and the following word, tag STAT_TYPE_T = 32, // the tag STAT_TYPE_TT = 64, // the tag bigram STAT_TYPE_TTT = 128, // the tag trigram STAT_TYPE_TTTT = 256, // the tag fourgram STAT_TYPE_WW = 512, // the word bigram STAT_TYPE_WWW = 1024, // the word trigram STAT_TYPE_W_T = 2048, // the word and the following tag STAT_TYPE_TW = 4096, // the word and the preceding tag STAT_TYPE_DEFAULT = STAT_TYPE_WT, }; // default POS tag value enum { POS_TAG_DEFAULT = 1, }; // describes types of hyphneation enum { HYPH_TYPE_CORE = 0, // just a pattern based hyphneation HYPH_TYPE_W2H_W2S, // independent W2H and W2S HYPH_TYPE_W2S_W2H, // W2S and W2H for each segment HYPH_TYPE_COUNT, HYPH_TYPE_DEFAULT = HYPH_TYPE_CORE, }; // hyphenation procedfures names (other modifications are not allowed) enum { HYPH_CONFLICT = -2, // conflict between two or more patterns HYPH_UNKNOWN = -1, // uncovered position of text HYPH_NO_HYPH = 0, // no-hyphenation point HYPH_SIMPLE_HYPH = 1, // a simple hyphenation HYPH_ADD_BEFORE, // add letter before hyphen HYPH_CHANGE_BEFORE, // change letter before hyphen HYPH_DELETE_BEFORE, // delete letter before hyphen HYPH_CHANGE_AFTER, // change letter after hyphen HYPH_DEL_AND_CHANGE, // delete letter before and change after hyphen HYPH_DONT_CARE, HYPH_COUNT, }; enum { MIN_LOG_PROB = -80, MAX_LOG_PROB = 0, }; /// bit masks for state's transitions representation enum { TRS_NONE = 0x00, // no Dsts TRS_IMPL = 0x02, // Dst = Src + |Src| TRS_PARA = 0x04, // two parallel arrays, Iws is sorted TRS_IWIA = 0x06, // Iws indexed array of Dsts, Dst == Dsts[Iw - Base] TRS_RANGE = 0x01, // Iws ranges, each of which corresponds to one Dst }; /// word/sequence case types enum { CASE_ALL_LOWER = 0, // all letters are lower case only CASE_CAPITALIZED, // first is capital the rest if any are lower only CASE_ALL_UPPER, // all more than one letters are upper case only CASE_OTHER, // other cases (mixed case or words with numbers) }; // defaults in packed representation enum { TRIV_PACK_DEF_DST_SIZE = 3, // default dst size for triv packed }; // LDB bin validation enum { VALIDATION_VERSION = 0, VALIDATION_SIZE, VALIDATION_HASH, VALIDATION_COUNT, }; // character normalization method enum { NORMALIZE_DEFAULT = 0, NORMALIZE_PRESERVE_DIACRITICS = 1, NORMALIZE_REMOVE_DIACRITICS = 2, NORMALIZE_COUNT, }; // segmentation model types enum { TOKENIZE_DEFAULT = 0, TOKENIZE_WORDPIECE = 1, TOKENIZE_UNIGRAM_LM = 2, TOKENIZE_BPE = 3, TOKENIZE_BPE_OPT = 4, // optimized version of the BPE, prefers a single token match over // subtoken, assumes tokens are delimited with U+x2581 TOKENIZE_COUNT, }; }; #endif