/*
 * Czech language stemmer - full (aggressive) version
 *
 * Source obtained from https://snowballstem.org/algorithms/czech/stemmer.html, under the
 * BSD public license, created by Ljiljana Dolamic & Jacques Savoy in 2009. Thank you ❤️
 *
 * Pipeline implemented by `stem`:
 *   1. mark_regions   - compute the pV / p1 region marks (forward scan)
 *   2. do_case        - strip a case ending
 *   3. do_possessive  - strip a possessive suffix
 *   4. do_aggressive  - comparative, diminutive, augmentative and
 *                       derivational suffix handling
 * Steps 2-4 run backwards from the end of the word and each one is optional.
 */

routines (
  RV R1
  palatalise
  mark_regions
  do_possessive
  do_case
  do_comparative
  do_diminutive
  do_augmentative
  do_derivational
  do_deriv_single
  do_aggressive
)

externals ( stem )

integers ( pV p1 )

groupings ( v )

stringescapes {}

// Named escapes for the accented Czech letters used in the suffix tables.
stringdef a' '{U+00E1}'
stringdef c^ '{U+010D}'
stringdef d^ '{U+010F}'
stringdef e' '{U+00E9}'
stringdef e^ '{U+011B}'
stringdef i' '{U+00ED}'
stringdef n^ '{U+0148}'
stringdef o' '{U+00F3}'
stringdef r^ '{U+0159}'
stringdef s^ '{U+0161}'
stringdef t^ '{U+0165}'
stringdef u' '{U+00FA}'
stringdef u* '{U+016F}'
stringdef y' '{U+00FD}'
stringdef z^ '{U+017E}'

// Vowel grouping: plain vowels plus their accented forms.
define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'

// Sets the two region marks used by RV and R1.
// Both start at `limit`. pV is then placed just past the first non-vowel;
// continuing from there, p1 is placed just past the first vowel that follows
// the next non-vowel. The scan is wrapped in `do`, so if the word runs out
// early the marks keep whatever value they had at that point.
define mark_regions as (

  $pV = limit
  $p1 = limit

  do (
    gopast non-v setmark pV
    gopast non-v gopast v setmark p1
  )
)

backwardmode (

  // Region tests: succeed when the cursor is at or beyond the mark.
  define RV as $pV <= cursor
  define R1 as $p1 <= cursor

  // Rewrites a palatalised ending (inside RV) to its base consonant(s):
  //   -> 'k', -> 'h', -> 'ck', -> 'sk'.
  // Signals failure when nothing matches, so callers that treat it as
  // optional wrap it in `try`.
  //
  // NOTE(review): '{c^}' is the only entry without a trailing vowel (compare
  // the 'zi' 'ze' '{z^}i' '{z^}e' row) - confirm whether '{c^}e' was intended.
  // NOTE(review): callers invoke this right after `delete` or `<- 'x'`;
  // confirm where the cursor sits relative to the retained vowel at that
  // point, since that determines which characters these entries can match.
  define palatalise as (
    [substring] RV among (
      'ci' 'ce' '{c^}i' '{c^}'
      (<- 'k')
      'zi' 'ze' '{z^}i' '{z^}e'
      (<- 'h')
      '{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
      (<- 'ck')
      '{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
      (<- 'sk')
    )
  )

  // Removes a possessive suffix found inside RV. After removing 'in',
  // palatalise is attempted but is not required to match.
  define do_possessive as (
    [substring] RV among (
      'ov' '{u*}v'
      (delete)
      'in'
      (
        delete
        try palatalise
      )
    )
  )

  // Removes a case ending. No region test is applied here.
  //   group 1: ending is simply deleted
  //   group 2: ending is deleted, then palatalise is attempted
  //   group 3: 'em' is reduced to 'e', then palatalise is attempted
  define do_case as (
    [substring] among (
      'atech'
      '{e^}tem' 'at{u*}m'
      '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
      'ata' 'aty' 'ama' 'ami' 'ovi'
      'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
      'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
      (delete)
      'ech' 'ich' '{i'}ch'
      '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
      'emi' 'iho' 'imu'
      '{e'}m' '{i'}m' 'es'
      'e' 'i' '{i'}' '{e^}'
      (
        delete
        try palatalise
      )
      'em'
      (
        <- 'e'
        try palatalise
      )
    )
  )

  // Handles a derivational suffix found inside R1.
  //   group 1: suffix is deleted
  //   other groups: suffix is reduced to its leading vowel and palatalise
  //   must then succeed, otherwise this routine signals failure (the
  //   replacement itself has already been applied by then).
  define do_derivational as (
    [substring] R1 among (
      'obinec'
      'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
      '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
      '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
      'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
      '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
      'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
      '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
      (delete)
      'ion{a'}{r^}'
      'inec' 'itel'
      'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
      'ic' 'in' 'it' 'iv'
      (
        <- 'i'
        palatalise
      )
      'enic'
      'ec' 'en'
      (
        <- 'e'
        palatalise
      )
      '{e'}{r^}'
      (
        <- '{e'}'
        palatalise
      )
      '{e^}n'
      (
        <- '{e^}'
        palatalise
      )
      '{i'}rn'
      '{i'}{r^}' '{i'}n'
      (
        <- '{i'}'
        palatalise
      )
    )
  )

  // Fallback used when do_derivational fails: drops one trailing consonant
  // from the listed set. No region test is applied.
  define do_deriv_single as (
    [substring] among (
      'c' '{c^}' 'k' 'l' 'n' 't'
      (delete)
    )
  )

  // Handles an augmentative suffix: either deleted outright, or reduced to
  // 'i' followed by a mandatory palatalise.
  define do_augmentative as (
    [substring] among (
      'ajzn'
      '{a'}k'
      (delete)
      'izn' 'isk'
      (
        <- 'i'
        palatalise
      )
    )
  )

  // Handles a diminutive suffix.
  //   group 1: suffix is deleted
  //   e / {e'} / i / {i'} groups: suffix is reduced to that vowel, then
  //   palatalise must succeed
  //   last four entries: suffix is reduced to its vowel with no palatalise
  define do_diminutive as (
    [substring] among (
      'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
      'anek' 'onek' 'unek' '{a'}nek'
      'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k'
      'enk' '{e'}nk' 'ink' '{i'}nk'
      '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k'
      'ank' 'onk' 'unk'
      '{a'}tk' '{a'}nk' 'u{s^}k'
      'k'
      (delete)
      'e{c^}ek' 'enek' 'ek'
      (
        <- 'e'
        palatalise
      )
      '{e'}{c^}ek' '{e'}k'
      (
        <- '{e'}'
        palatalise
      )
      'i{c^}ek' 'inek' 'ik'
      (
        <- 'i'
        palatalise
      )
      '{i'}{c^}ek' '{i'}k'
      (
        <- '{i'}'
        palatalise
      )
      '{a'}k'
      (<- '{a'}')
      'ak'
      (<- 'a')
      'ok'
      (<- 'o')
      'uk'
      (<- 'u')
    )
  )

  // Handles a comparative suffix: reduced to its leading vowel, then
  // palatalise must succeed.
  define do_comparative as (
    [substring] among (
      '{e^}j{s^}'
      (
        <- '{e^}'
        palatalise
      )
      'ej{s^}'
      (
        <- 'e'
        palatalise
      )
    )
  )

  // Aggressive-only steps. The first three are optional (`do`); the last
  // tries do_derivational and falls back to do_deriv_single if it fails.
  define do_aggressive as (
    do do_comparative
    do do_diminutive
    do do_augmentative
    do_derivational or do_deriv_single
  )
)

// Entry point. Computes the region marks, then works backwards from the end
// of the word.
//
// Each step is wrapped in `do` so that a step which finds nothing to strip
// does not abort the steps after it. In a bare sequence a failing do_case
// (word without a case ending) would skip do_possessive and do_aggressive
// entirely.
define stem as (
  do mark_regions
  backwards (
    do do_case
    do do_possessive
    // light and aggressive are the same to this point
    // comment next line for light stemmer
    do do_aggressive
  )
)

// Ljiljana Dolamic and Jacques Savoy. 2009.
// Indexing and stemming approaches for the Czech language.
// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt