/*
 * Authors:
 * - Assem Chelli, < assem [dot] ch [at] gmail >
 * - Abdelkrim Aries
 *
 */

// NOTE: Snowball `//` comments extend to the end of the physical line, so
// every statement below must stay on its own line; joining lines would
// silently comment out the code that follows a `//`.

stringescapes { }

/* the Arabic letters in Unicode */
// Hamza
stringdef o     hex '621' // Hamza
stringdef ao    hex '623' // Hamza above Alef
stringdef ao_   hex '625' // Hamza below Alef
stringdef a~    hex '622' // Alef madda
stringdef wo    hex '624' // Hamza above waw
stringdef yo    hex '626' // Hamza above yeh

// Letters
stringdef a     hex '627' // Alef
stringdef a_    hex '649' // Alef Maksura
stringdef b     hex '628' // Beh
stringdef t_    hex '629' // Teh_Marbuta
stringdef t     hex '62a' // Teh
stringdef th    hex '62b' // Theh
stringdef j     hex '62c' // Jeem
stringdef h     hex '62d' // Hah
stringdef x     hex '62e' // Khah
stringdef d     hex '62f' // Dal
stringdef dz    hex '630' // Thal
stringdef r     hex '631' // Reh
stringdef z     hex '632' // Zain
stringdef s     hex '633' // Seen
stringdef sh    hex '634' // Sheen
stringdef c     hex '635' // Sad
stringdef dh    hex '636' // Dad
stringdef tt    hex '637' // Tah
stringdef zh    hex '638' // Zah
stringdef i     hex '639' // Ain
stringdef gh    hex '63a' // Ghain
stringdef f     hex '641' // Feh
stringdef q     hex '642' // Qaf
stringdef k     hex '643' // Kaf
stringdef l     hex '644' // Lam
stringdef m     hex '645' // Meem
stringdef n     hex '646' // Noon
stringdef e     hex '647' // Heh
stringdef w     hex '648' // Waw
stringdef y     hex '64a' // Yeh

// Diacritics
stringdef aan   hex '64b' // FatHatan
stringdef uun   hex '64c' // Dammatan
stringdef iin   hex '64d' // Kasratan
stringdef aa    hex '64e' // FatHa
stringdef uu    hex '64f' // Damma
stringdef ii    hex '650' // Kasra
stringdef oo    hex '652' // Sukun
stringdef ~     hex '651' // Shadda

// Hindu–Arabic numerals
stringdef 0     hex '0660'
stringdef 1     hex '0661'
stringdef 2     hex '0662'
stringdef 3     hex '0663'
stringdef 4     hex '0664'
stringdef 5     hex '0665'
stringdef 6     hex '0666'
stringdef 7     hex '0667'
stringdef 8     hex '0668'
stringdef 9     hex '0669'
stringdef %     hex '066a' // PERCENT
stringdef .     hex '066b' // DECIMAL
stringdef '     hex '066c' // THOUSANDS

// Kasheeda
stringdef _     hex '640' // Kasheeda, Tatweel

// Punctuation marks
stringdef ,     hex '060C' // COMMA
stringdef ;     hex '061B' // SEMICOLON
stringdef ?     hex '061F' // QUESTION

// Shaped forms
stringdef o1    hex 'fe80' // HAMZA
stringdef ao1   hex 'fe83' // ALEF_HAMZA_ABOVE
stringdef ao2   hex 'fe84' // ALEF_HAMZA_ABOVE
stringdef ao_1  hex 'fe87' // ALEF_HAMZA_BELOW
stringdef ao_2  hex 'fe88' // ALEF_HAMZA_BELOW
stringdef yo1   hex 'fe8b' // YEH_HAMZA
stringdef yo2   hex 'fe8c' // YEH_HAMZA
stringdef yo3   hex 'fe89' // YEH_HAMZA
stringdef yo4   hex 'fe8a' // YEH_HAMZA
stringdef a~1   hex 'fe81' // ALEF_MADDA
stringdef a~2   hex 'fe82' // ALEF_MADDA
stringdef wo1   hex 'fe85' // WAW_HAMZA
stringdef wo2   hex 'fe86' // WAW_HAMZA
stringdef a1    hex 'fe8d' // ALEF
stringdef a2    hex 'fe8e' // ALEF
stringdef b1    hex 'fe8f' // BEH
stringdef b2    hex 'fe90' // BEH
stringdef b3    hex 'fe91' // BEH
stringdef b4    hex 'fe92' // BEH
stringdef t_1   hex 'fe93' // TEH_MARBUTA
stringdef t_2   hex 'fe94' // TEH_MARBUTA
stringdef t1    hex 'fe97' // TEH
stringdef t2    hex 'fe98' // TEH
stringdef t3    hex 'fe95' // TEH
stringdef t4    hex 'fe96' // TEH
stringdef th1   hex 'fe9b' // THEH
stringdef th2   hex 'fe9c' // THEH
stringdef th3   hex 'fe9a' // THEH
stringdef th4   hex 'fe99' // THEH
stringdef j1    hex 'fe9f' // JEEM
stringdef j2    hex 'fea0' // JEEM
stringdef j3    hex 'fe9d' // JEEM
stringdef j4    hex 'fe9e' // JEEM
stringdef h1    hex 'fea3' // HAH
stringdef h2    hex 'fea4' // HAH
stringdef h3    hex 'fea1' // HAH
stringdef h4    hex 'fea2' // HAH
stringdef x1    hex 'fea7' // KHAH
stringdef x2    hex 'fea8' // KHAH
stringdef x3    hex 'fea5' // KHAH
stringdef x4    hex 'fea6' // KHAH
stringdef d1    hex 'fea9' // DAL
stringdef d2    hex 'feaa' // DAL
stringdef dz1   hex 'feab' // THAL
stringdef dz2   hex 'feac' // THAL
stringdef r1    hex 'fead' // REH
stringdef r2    hex 'feae' // REH
stringdef z1    hex 'feaf' // ZAIN
stringdef z2    hex 'feb0' // ZAIN
stringdef s1    hex 'feb3' // SEEN
stringdef s2    hex 'feb4' // SEEN
stringdef s3    hex 'feb1' // SEEN
stringdef s4    hex 'feb2' // SEEN
stringdef sh1   hex 'feb7' // SHEEN
stringdef sh2   hex 'feb8' // SHEEN
stringdef sh3   hex 'feb5' // SHEEN
stringdef sh4   hex 'feb6' // SHEEN
stringdef c1    hex 'febb' // SAD
stringdef c2    hex 'febc' // SAD
stringdef c3    hex 'feb9' // SAD
stringdef c4    hex 'feba' // SAD
stringdef dh1   hex 'febf' // DAD
stringdef dh2   hex 'fec0' // DAD
stringdef dh3   hex 'febd' // DAD
stringdef dh4   hex 'febe' // DAD
stringdef tt1   hex 'fec3' // TAH
stringdef tt2   hex 'fec4' // TAH
stringdef tt3   hex 'fec1' // TAH
stringdef tt4   hex 'fec2' // TAH
stringdef zh1   hex 'fec7' // ZAH
stringdef zh2   hex 'fec8' // ZAH
stringdef zh3   hex 'fec5' // ZAH
stringdef zh4   hex 'fec6' // ZAH
stringdef i1    hex 'fecb' // AIN
stringdef i2    hex 'fecc' // AIN
stringdef i3    hex 'fec9' // AIN
stringdef i4    hex 'feca' // AIN
stringdef gh1   hex 'fecf' // GHAIN
stringdef gh2   hex 'fed0' // GHAIN
stringdef gh3   hex 'fecd' // GHAIN
stringdef gh4   hex 'fece' // GHAIN
stringdef f1    hex 'fed3' // FEH
stringdef f2    hex 'fed4' // FEH
stringdef f3    hex 'fed1' // FEH
stringdef f4    hex 'fed2' // FEH
stringdef q1    hex 'fed7' // QAF
stringdef q2    hex 'fed8' // QAF
stringdef q3    hex 'fed5' // QAF
stringdef q4    hex 'fed6' // QAF
stringdef k1    hex 'fedb' // KAF
stringdef k2    hex 'fedc' // KAF
stringdef k3    hex 'fed9' // KAF
stringdef k4    hex 'feda' // KAF
stringdef l1    hex 'fedf' // LAM
stringdef l2    hex 'fee0' // LAM
stringdef l3    hex 'fedd' // LAM
stringdef l4    hex 'fede' // LAM
stringdef m1    hex 'fee3' // MEEM
stringdef m2    hex 'fee4' // MEEM
stringdef m3    hex 'fee1' // MEEM
stringdef m4    hex 'fee2' // MEEM
stringdef n1    hex 'fee7' // NOON
stringdef n2    hex 'fee8' // NOON
stringdef n3    hex 'fee5' // NOON
stringdef n4    hex 'fee6' // NOON
stringdef e1    hex 'feeb' // HEH
stringdef e2    hex 'feec' // HEH
stringdef e3    hex 'fee9' // HEH
stringdef e4    hex 'feea' // HEH
stringdef w1    hex 'feed' // WAW
stringdef w2    hex 'feee' // WAW
stringdef a_1   hex 'feef' // ALEF_MAKSURA
stringdef a_2   hex 'fef0' // ALEF_MAKSURA
stringdef y1    hex 'fef3' // YEH
stringdef y2    hex 'fef4' // YEH
stringdef y3    hex 'fef1' // YEH
stringdef y4    hex 'fef2' // YEH

// Ligatures Lam-Alef
stringdef la    hex 'fefb' // LAM_ALEF
stringdef la2   hex 'fefc' // LAM_ALEF
stringdef lao   hex 'fef7' // LAM_ALEF_HAMZA_ABOVE
stringdef lao2  hex 'fef8' // LAM_ALEF_HAMZA_ABOVE
stringdef lao_  hex 'fef9' // LAM_ALEF_HAMZA_BELOW
stringdef lao_2 hex 'fefa' // LAM_ALEF_HAMZA_BELOW
stringdef la~   hex 'fef5' // LAM_ALEF_MADDA_ABOVE
stringdef la~2  hex 'fef6' // LAM_ALEF_MADDA_ABOVE

// word_len: scratch copy of `len`, refreshed at the top of every prefix,
//           suffix and check routine before its length guards are tested.
integers ( word_len )

// is_noun / is_verb: word classes still considered possible for the word.
// is_defined:        set by Checks1 when an article-like prefix is present.
booleans ( is_noun is_verb is_defined )

routines (
    Prefix_Step1
    Prefix_Step2
    Prefix_Step3a_Noun
    Prefix_Step3b_Noun
    Prefix_Step3_Verb
    Prefix_Step4_Verb

    Suffix_All_alef_maqsura
    Suffix_Noun_Step1a
    Suffix_Noun_Step1b
    Suffix_Noun_Step2a
    Suffix_Noun_Step2b
    Suffix_Noun_Step2c1
    Suffix_Noun_Step2c2
    Suffix_Noun_Step3
    Suffix_Verb_Step1
    Suffix_Verb_Step2a
    Suffix_Verb_Step2b
    Suffix_Verb_Step2c

    Normalize_post
    Normalize_pre

    Checks1
)

externals ( stem )

groupings ( )


// Normalizations

// Pre-stemming clean-up. Walks the word from the cursor; on each pass it
// either rewrites/deletes the entry matched at the cursor or, when nothing
// matches, steps over one character with `next`. It removes diacritics,
// tatweel and punctuation, maps Arabic-Indic digits to ASCII digits, and
// folds presentation (shaped) forms and Lam-Alef ligatures back to the base
// letters.
define Normalize_pre as (
    loop len (
        (
            [substring] among (
                '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}' ( delete ) // strip vocalization
                '{_}' ( delete ) // strip kasheeda

                // Punctuation marks
                '.' ',' ';' ':' '?' '!' '/' '*' '%' '\' '"' ( delete ) // General
                '{,}' '{;}' '{?}' ( delete ) // Arabic-specific

                // Hindu–Arabic numerals
                '{0}' ( <- '0' )
                '{1}' ( <- '1' )
                '{2}' ( <- '2' )
                '{3}' ( <- '3' )
                '{4}' ( <- '4' )
                '{5}' ( <- '5' )
                '{6}' ( <- '6' )
                '{7}' ( <- '7' )
                '{8}' ( <- '8' )
                '{9}' ( <- '9' )
                '{%}' '{.}' '{'}' ( delete )

                // Shaped forms
                '{o1}' ( <- '{o}' ) // HAMZA
                '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE
                '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW
                '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA
                '{a~1}' '{a~2}' ( <- '{a~}' ) // ALEF_MADDA
                '{wo1}' '{wo2}' ( <- '{wo}' ) // WAW_HAMZA
                '{a1}' '{a2}' ( <- '{a}' ) // ALEF
                '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH
                '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA
                '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH
                '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH
                '{j1}' '{j2}' '{j3}' '{j4}' ( <- '{j}' ) // JEEM
                '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH
                '{x1}' '{x2}' '{x3}' '{x4}' ( <- '{x}' ) // KHAH
                '{d1}' '{d2}' ( <- '{d}' ) // DAL
                '{dz1}' '{dz2}' ( <- '{dz}' ) // THAL
                '{r1}' '{r2}' ( <- '{r}' ) // REH
                '{z1}' '{z2}' ( <- '{z}' ) // ZAIN
                '{s1}' '{s2}' '{s3}' '{s4}' ( <- '{s}' ) // SEEN
                '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN
                '{c1}' '{c2}' '{c3}' '{c4}' ( <- '{c}' ) // SAD
                '{dh1}' '{dh2}' '{dh3}' '{dh4}' ( <- '{dh}' ) // DAD
                '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH
                '{zh1}' '{zh2}' '{zh3}' '{zh4}' ( <- '{zh}' ) // ZAH
                '{i1}' '{i2}' '{i3}' '{i4}' ( <- '{i}' ) // AIN
                '{gh1}' '{gh2}' '{gh3}' '{gh4}' ( <- '{gh}' ) // GHAIN
                '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH
                '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF
                '{k1}' '{k2}' '{k3}' '{k4}' ( <- '{k}' ) // KAF
                '{l1}' '{l2}' '{l3}' '{l4}' ( <- '{l}' ) // LAM
                '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM
                '{n1}' '{n2}' '{n3}' '{n4}' ( <- '{n}' ) // NOON
                '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH
                '{w1}' '{w2}' ( <- '{w}' ) // WAW
                '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA
                '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH

                // Ligatures Lam-Alef
                '{la}' '{la2}' ( <- '{l}{a}' )
                '{lao}' '{lao2}' ( <- '{l}{ao}' )
                '{lao_}' '{lao_2}' ( <- '{l}{ao_}' )
                '{la~}' '{la~2}' ( <- '{l}{a~}' )
            )
        )
        or
        next
    )
)

// Post-stemming hamza normalization, in two independent `do` steps:
//   1. a hamza carrier at the very end of the word becomes a bare hamza;
//   2. every remaining hamza carrier is replaced by its carrier letter
//      (alef, waw or yeh).
define Normalize_post as (

    do (
        // normalize last hamza
        backwards (
            [substring] among (
                '{ao}' '{ao_}' '{a~}' ( <- '{o}' )
                '{wo}' ( <- '{o}' )
                '{yo}' ( <- '{o}' )
            )
        )
    )

    // Iterate over the current word length rather than whatever value an
    // earlier routine happened to leave in word_len. Every replacement below
    // is one character for one character, so `len` passes visit the whole
    // word exactly once.
    do loop len (
        (
            // normalize other hamza's
            [substring] among (
                '{ao}' '{ao_}' '{a~}' ( <- '{a}' )
                '{wo}' ( <- '{w}' )
                '{yo}' ( <- '{y}' )
            )
        )
        or
        next
    )
)

// Checks

// Looks for an article-like prefix at the cursor (bal-, kal-, ll-, al-).
// When one is found and the word is longer than the guard, the word is
// flagged as a defined noun and no longer treated as a possible verb.
// Nothing is removed here; the routine only sets flags.
define Checks1 as (
    $word_len = len
    [substring] among (
        '{b}{a}{l}' '{k}{a}{l}' ( $word_len > 4 set is_noun unset is_verb set is_defined )
        '{l}{l}' '{a}{l}' ( $word_len > 3 set is_noun unset is_verb set is_defined )
    )
)


//prefixes

// Collapses a leading alef-hamza followed by another alef/hamza form into a
// single letter, for words longer than three characters.
define Prefix_Step1 as (
    $word_len = len
    [substring] among (
        '{ao}{ao}' ( $word_len > 3 <- '{ao}' )
        '{ao}{a~}' ( $word_len > 3 <- '{a~}' )
        '{ao}{wo}' ( $word_len > 3 <- '{ao}' )
        '{ao}{a}' ( $word_len > 3 <- '{a}' )
        '{ao}{ao_}' ( $word_len > 3 <- '{ao_}' )
        // '{ao}' ($word_len > 3 delete) //rare case
    )
)

// Removes a leading feh or waw from words longer than three characters,
// unless the word starts with feh-alef or waw-alef.
define Prefix_Step2 as (
    $word_len = len
    not '{f}{a}'
    not '{w}{a}'
    [substring] among (
        '{f}' ( $word_len > 3 delete )
        '{w}' ( $word_len > 3 delete )
    )
)

// Removes the article-like prefixes that Checks1 looks for, using stricter
// length guards than Checks1.
define Prefix_Step3a_Noun as ( // it is noun and defined
    $word_len = len
    [substring] among (
        '{b}{a}{l}' '{k}{a}{l}' ( $word_len > 5 delete )
        '{l}{l}' '{a}{l}' ( $word_len > 4 delete )
    )
)

// Removes a leading beh, or collapses a doubled leading beh/kaf to a single
// letter; skipped entirely when the word starts with beh-alef.
define Prefix_Step3b_Noun as ( // probably noun and defined
    $word_len = len
    not '{b}{a}' // exception
    [substring] among (
        '{b}' ( $word_len > 3 delete )
        // '{k}' '{l}' ($word_len > 3 delete) // BUG: cause confusion
        '{b}{b}' ( $word_len > 3 <- '{b}' )
        '{k}{k}' ( $word_len > 3 <- '{k}' )
    )
)

// Drops a leading seen when it is followed by yeh, teh, noon or alef-hamza,
// keeping that following letter.
define Prefix_Step3_Verb as (
    $word_len = len
    [substring] among (
        //'{s}' ($word_len > 4 delete)// BUG: cause confusion
        '{s}{y}' ( $word_len > 4 <- '{y}' )
        '{s}{t}' ( $word_len > 4 <- '{t}' )
        '{s}{n}' ( $word_len > 4 <- '{n}' )
        '{s}{ao}' ( $word_len > 4 <- '{ao}' )
    )
)

// Rewrites a leading yst-/nst-/tst- to ast- and marks the word as a verb
// (and not a noun).
define Prefix_Step4_Verb as (
    $word_len = len
    [substring] among (
        '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ( $word_len > 4 set is_verb unset is_noun <- '{a}{s}{t}' )
    )
)

// suffixes
// Everything in this section is compiled in backward mode, so each among
// matches at the end of the word. Each routine refreshes word_len first and
// only deletes when the word meets the length guard next to the entry.
backwardmode (

    define Suffix_Noun_Step1a as (
        $word_len = len
        [substring] among (
            '{y}' '{k}' '{e}' ( $word_len >= 4 delete )
            '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ( $word_len >= 5 delete )
            '{k}{m}{a}' '{e}{m}{a}' ( $word_len >= 6 delete )
        )
    )

    define Suffix_Noun_Step1b as (
        $word_len = len
        [substring] among (
            '{n}' ( $word_len > 5 delete )
        )
    )

    define Suffix_Noun_Step2a as (
        $word_len = len
        [substring] among (
            '{a}' '{y}' '{w}' ( $word_len > 4 delete )
        )
    )

    define Suffix_Noun_Step2b as (
        $word_len = len
        [substring] among (
            '{a}{t}' ( $word_len >= 5 delete )
        )
    )

    define Suffix_Noun_Step2c1 as (
        $word_len = len
        [substring] among (
            '{t}' ( $word_len >= 4 delete )
        )
    )

    define Suffix_Noun_Step2c2 as ( // feminine t_
        $word_len = len
        [substring] among (
            '{t_}' ( $word_len >= 4 delete )
        )
    )

    define Suffix_Noun_Step3 as ( // ya' nisbiya
        $word_len = len
        [substring] among (
            '{y}' ( $word_len >= 3 delete )
        )
    )

    define Suffix_Verb_Step1 as (
        $word_len = len
        [substring] among (
            '{e}' '{k}' ( $word_len >= 4 delete )
            '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ( $word_len >= 5 delete )
            '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}' ( $word_len >= 6 delete )
        )
    )

    define Suffix_Verb_Step2a as (
        $word_len = len
        [substring] among (
            '{t}' ( $word_len >= 4 delete )
            '{a}' '{n}' '{y}' ( $word_len >= 4 delete )
            '{n}{a}' '{t}{a}' '{t}{n}' ( $word_len >= 5 delete ) // past
            '{a}{n}' '{w}{n}' '{y}{n}' ( $word_len > 5 delete ) // present
            '{t}{m}{a}' ( $word_len >= 6 delete )
        )
    )

    define Suffix_Verb_Step2b as (
        $word_len = len
        [substring] among (
            '{w}{a}' '{t}{m}' ( $word_len >= 5 delete ) // len >= 5
        )
    )

    define Suffix_Verb_Step2c as (
        $word_len = len
        [substring] among (
            '{w}' ( $word_len >= 4 delete )
            '{t}{m}{w}' ( $word_len >= 6 delete )
        )
    )

    // Replaces a final alef maqsura with yeh.
    define Suffix_All_alef_maqsura as (
        $word_len = len
        [substring] among (
            '{a_}' ( <- '{y}' ) // spell error
            // '{a_}' ( delete ) // if noun > 3
            // '{a_}' ( <- '{a}') // if verb
        )
    )
)

// Entry point. Order of work:
//   1. reset the flags (noun and verb both possible, not defined);
//   2. Checks1 guesses the word class;
//   3. Normalize_pre cleans the word;
//   4. one suffix family is applied from the end of the word (verb rules,
//      else noun rules, else the alef-maqsura fix);
//   5. prefixes are stripped from the start;
//   6. Normalize_post unifies hamza forms.
// Every stage is wrapped in `do`, so a stage that fails does not stop the
// following stages from running.
// NOTE(review): Checks1 runs before Normalize_pre, so its prefix match and
// length guards see the raw word (diacritics, tatweel and shaped forms
// included). Confirm this ordering is intended before changing it, since
// swapping the two would alter stemmer output.
define stem as (

    // set initial values
    set is_noun
    set is_verb
    unset is_defined

    // guess type and properties
    do Checks1

    // normalization pre-stemming
    do Normalize_pre

    backwards (
        do (
            //Suffixes for verbs
            (
                is_verb
                (
                    (
                        (atleast 1 Suffix_Verb_Step1)
                        ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next )
                    )
                    or Suffix_Verb_Step2b
                    or Suffix_Verb_Step2a
                )
            )
            //Suffixes for nouns
            or (
                is_noun
                (
                    try (
                        Suffix_Noun_Step2c2
                        or (
                            not is_defined Suffix_Noun_Step1a (
                                Suffix_Noun_Step2a
                                or Suffix_Noun_Step2b
                                or Suffix_Noun_Step2c1
                                or next
                            )
                        )
                        or (
                            Suffix_Noun_Step1b (
                                Suffix_Noun_Step2a
                                or Suffix_Noun_Step2b
                                or Suffix_Noun_Step2c1
                            )
                        )
                        or (not is_defined Suffix_Noun_Step2a)
                        or (Suffix_Noun_Step2b)
                    )
                    Suffix_Noun_Step3
                )
            )
            // Suffixes for alef maqsura
            or Suffix_All_alef_maqsura
        )
    )

    //Prefixes
    do (
        try Prefix_Step1
        try Prefix_Step2
        (
            Prefix_Step3a_Noun
            or (is_noun Prefix_Step3b_Noun)
            or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb)
        )
    )

    // normalization post-stemming
    do Normalize_post
)