#!/usr/bin/env python

"""
A feature extractor for named entity recognition (NER).
Copyright 2010,2011 Naoaki Okazaki.
"""

# Separator of field values.
separator = ' '

# Field names of the input data.
fields = 'y w pos chk'

import crfutils


def get_shape(token):
    """Return the character-class shape of a token.

    Each character is mapped to a class symbol: 'U' (uppercase),
    'L' (lowercase), 'D' (digit), '.' (period or comma), ';' (one of
    ; : ? !), '-' (one of + - * / = | _), '(' (opening bracket),
    ')' (closing bracket).  Any other character is kept unchanged.
    """
    shape = []
    for c in token:
        # The case/digit tests come first, so they take precedence over the
        # literal symbol groups below.
        if c.isupper():
            shape.append('U')
        elif c.islower():
            shape.append('L')
        elif c.isdigit():
            shape.append('D')
        elif c in ('.', ','):
            shape.append('.')
        elif c in (';', ':', '?', '!'):
            shape.append(';')
        elif c in ('+', '-', '*', '/', '=', '|', '_'):
            shape.append('-')
        elif c in ('(', '{', '[', '<'):
            shape.append('(')
        elif c in (')', '}', ']', '>'):
            shape.append(')')
        else:
            shape.append(c)
    return ''.join(shape)


def degenerate(src):
    """Collapse every run of identical consecutive characters into one.

    For example, 'ULLLDD' becomes 'ULD'.
    """
    dst = []
    for c in src:
        if not dst or dst[-1] != c:
            dst.append(c)
    return ''.join(dst)


def get_type(token):
    """Return a coarse type tag for a token.

    Returns 'EMPTY' for an empty token.  Otherwise every tag in T starts
    as a candidate, each character eliminates the candidates it
    contradicts, and the first surviving tag (in the order of T) is
    returned.  Returns 'NO' when no candidate survives.
    """
    # Candidate tags, in priority order.
    T = (
        'AllUpper', 'AllDigit', 'AllSymbol',
        'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
        'AllUpperDigitSymbol',
        'InitUpper',
        'AllLetter',
        'AllAlnum',
        )
    if not token:
        return 'EMPTY'

    R = set(T)
    for i, c in enumerate(token):
        if c.isupper():
            R.difference_update((
                'AllDigit', 'AllSymbol', 'AllDigitSymbol',
                ))
        elif c.isdigit() or c in (',', '.'):
            # Commas and periods are treated like digits here.
            R.difference_update((
                'AllUpper', 'AllSymbol', 'AllUpperSymbol', 'AllLetter',
                ))
        elif c.islower():
            R.difference_update((
                'AllUpper', 'AllDigit', 'AllSymbol',
                'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
                'AllUpperDigitSymbol',
                ))
        else:
            R.difference_update((
                'AllUpper', 'AllDigit', 'AllUpperDigit',
                'AllLetter', 'AllAlnum',
                ))

        # Only the first character decides whether 'InitUpper' survives.
        if i == 0 and not c.isupper():
            R.discard('InitUpper')

    for tag in T:
        if tag in R:
            return tag
    return 'NO'


def get_2d(token):
    """Return True if the token consists of exactly two digits."""
    return len(token) == 2 and token.isdigit()


def get_4d(token):
    """Return True if the token consists of exactly four digits."""
    return len(token) == 4 and token.isdigit()


def get_da(token):
    """Return True if the token mixes digits and letters and nothing else.

    The token must contain at least one digit and at least one alphabetic
    character; any other character makes the result False.
    """
    has_digit = False
    has_alpha = False
    for c in token:
        if c.isdigit():
            has_digit = True
        elif c.isalpha():
            has_alpha = True
        else:
            return False
    return has_digit and has_alpha


def get_dand(token, p):
    """Return True if the token mixes digits and the character p only.

    The token must contain at least one digit and at least one occurrence
    of p; any other character makes the result False.
    """
    has_digit = False
    has_p = False
    for c in token:
        if c.isdigit():
            has_digit = True
        elif c == p:
            has_p = True
        else:
            return False
    return has_digit and has_p


def get_all_other(token):
    """Return True if the token has no alphanumeric character.

    An empty token yields True.
    """
    return not any(c.isalnum() for c in token)


def get_capperiod(token):
    """Return True for an uppercase letter followed by '.' (e.g. 'J.')."""
    return len(token) == 2 and token[0].isupper() and token[1] == '.'


def contains_upper(token):
    """Return True if the token contains an uppercase character."""
    return any(c.isupper() for c in token)


def contains_lower(token):
    """Return True if the token contains a lowercase character."""
    return any(c.islower() for c in token)


def contains_alpha(token):
    """Return True if the token contains an alphabetic character."""
    return any(c.isalpha() for c in token)


def contains_digit(token):
    """Return True if the token contains a digit."""
    return any(c.isdigit() for c in token)


def contains_symbol(token):
    """Return True if the token contains a non-alphanumeric character.

    An earlier version applied the bitwise operator '~' to the result of
    isalnum(); that yields -1 or -2, both truthy, so every non-empty token
    was reported as containing a symbol.  The logical 'not' used here
    changes the value of the 'cs' feature accordingly, so models trained
    on the old output need retraining.
    """
    return any(not c.isalnum() for c in token)


def b(v):
    """Convert a truth value into the attribute string 'yes' or 'no'."""
    return 'yes' if v else 'no'


def observation(v, defval=''):
    """Add the token-level observations to the item v, in place.

    v is a dict-like item whose 'w' entry holds the token string; the
    derived attributes are stored under new keys of the same item.
    defval is stored for a prefix or suffix that is longer than the token.
    """
    w = v['w']
    n = len(w)

    # Lowercased token.
    v['wl'] = w.lower()
    # Token shape.
    v['shape'] = get_shape(w)
    # Token shape degenerated.
    v['shaped'] = degenerate(v['shape'])
    # Token type.
    v['type'] = get_type(w)

    # Prefixes (length between one to four).
    v['p1'] = w[0] if n >= 1 else defval
    v['p2'] = w[:2] if n >= 2 else defval
    v['p3'] = w[:3] if n >= 3 else defval
    v['p4'] = w[:4] if n >= 4 else defval

    # Suffixes (length between one to four).
    v['s1'] = w[-1] if n >= 1 else defval
    v['s2'] = w[-2:] if n >= 2 else defval
    v['s3'] = w[-3:] if n >= 3 else defval
    v['s4'] = w[-4:] if n >= 4 else defval

    # Two digits.
    v['2d'] = b(get_2d(w))
    # Four digits.
    v['4d'] = b(get_4d(w))
    # Alphanumeric token.
    v['d&a'] = b(get_da(w))
    # Digits and '-'.
    v['d&-'] = b(get_dand(w, '-'))
    # Digits and '/'.
    v['d&/'] = b(get_dand(w, '/'))
    # Digits and ','.
    v['d&,'] = b(get_dand(w, ','))
    # Digits and '.'.
    v['d&.'] = b(get_dand(w, '.'))
    # An uppercase letter followed by '.'.
    v['up'] = b(get_capperiod(w))

    # An initial uppercase letter.
    v['iu'] = b(w and w[0].isupper())
    # All uppercase letters.
    v['au'] = b(w.isupper())
    # All lowercase letters.
    v['al'] = b(w.islower())
    # All digit letters.
    v['ad'] = b(w.isdigit())
    # All other (non-alphanumeric) letters.
    v['ao'] = b(get_all_other(w))

    # Contains an uppercase letter.
    v['cu'] = b(contains_upper(w))
    # Contains a lowercase letter.
    v['cl'] = b(contains_lower(w))
    # Contains an alphabet letter.
    v['ca'] = b(contains_alpha(w))
    # Contains a digit.
    v['cd'] = b(contains_digit(w))
    # Contains a symbol.
    v['cs'] = b(contains_symbol(w))


def disjunctive(X, t, field, begin, end):
    """Append position-independent window features to item t of X.

    For every offset in [begin, end] whose position t + offset lies inside
    the sequence, the string '<field>[<begin>..<end>]=<value>' is appended
    to X[t]['F'], where <value> is X[t + offset][field].  The feature name
    does not include the offset itself.
    """
    name = '%s[%d..%d]' % (field, begin, end)
    length = len(X)
    for offset in range(begin, end + 1):
        p = t + offset
        # Skip positions that fall outside the sequence.
        if 0 <= p < length:
            X[t]['F'].append('%s=%s' % (name, X[p][field]))


# Attributes used for unigram templates (offsets -2 to +2).
U = [
    'w', 'wl', 'pos', 'chk', 'shape', 'shaped', 'type',
    'p1', 'p2', 'p3', 'p4',
    's1', 's2', 's3', 's4',
    '2d', '4d', 'd&a', 'd&-', 'd&/', 'd&,', 'd&.', 'up',
    'iu', 'au', 'al', 'ad', 'ao',
    'cu', 'cl', 'ca', 'cd', 'cs',
    ]

# Attributes used for bigram templates (offset pairs (i, i+1), i in -2..1).
B = ['w', 'pos', 'chk', 'shaped', 'type']

# Feature templates passed to crfutils.apply_templates().
templates = []
for name in U:
    templates += [((name, i),) for i in range(-2, 3)]
for name in B:
    templates += [((name, i), (name, i+1)) for i in range(-2, 2)]


def feature_extractor(X):
    """Generate the features of every item in the sequence X, in place.

    X is a sequence of dict-like items, each with a 'w' entry.  Features
    are appended to each item's 'F' list.
    NOTE(review): the 'F' list is not created in this file; presumably
    crfutils provides it -- confirm against crfutils.
    """
    # Append observations.
    for x in X:
        observation(x)

    # Apply the feature templates.
    crfutils.apply_templates(X, templates)

    # Append disjunctive features.
    for t in range(len(X)):
        disjunctive(X, t, 'w', -4, -1)
        disjunctive(X, t, 'w', 1, 4)

    # Append BOS and EOS features.
    if X:
        X[0]['F'].append('__BOS__')
        X[-1]['F'].append('__EOS__')


if __name__ == '__main__':
    crfutils.main(feature_extractor, fields=fields, sep=separator)