/*------------------------------------------------------------------------ Copyright (C) 2002-2016 SIL International. All rights reserved. Distributable under the terms of either the Common Public License or the GNU Lesser General Public License, as specified in the LICENSING.txt file. File: TECkit_Format.h Responsibility: Jonathan Kew Last reviewed: Not yet. Description: Definitions used in the TECkit binary table format 2006-06-02 jk added support for extended string rules (>255 per initial char) -------------------------------------------------------------------------*/ #ifndef __TECkit_Format_H__ #define __TECkit_Format_H__ #include "teckit-Common.h" #define kMagicNumber 0x714d6170 /* 'qMap' */ #define kMagicNumberCmp 0x7a516d70 /* 'zQmp' */ #define kFileVersion2_1 0x00020001 /* version before tables with ExtStringRules */ #define kCurrentFileVersion 0x00030000 /* current version */ #define kTableVersion2 0x00020000 #define kCurrentTableVersion 0x00030000 /* actually, the engine doesn't check this, it only looks at the file version */ struct NameRec { UInt16 nameID; UInt16 nameLength; /* Byte data[nameLength]; pad to 2-byte boundary */ }; struct FileHeader { UInt32 type; /* magic number = 'qMap' */ UInt32 version; /* version = kFileCurrentVersion */ UInt32 headerLength; /* length of this header including offset arrays and name records */ UInt32 formFlagsLHS; /* flags for normalization form, Unicode/byte encoding on LHS of mapping */ UInt32 formFlagsRHS; /* flags for normalization form, Unicode/byte encoding on RHS of mapping */ UInt32 numNames; /* number of strings in the names table */ UInt32 numFwdTables; /* number of tables in forward pipeline */ UInt32 numRevTables; /* number of tables in reverse pipeline */ }; struct TableHeader { UInt32 type; /* type = 'B->B', 'B->U', 'U->B', 'U->U' */ /* or type = 'NFC ', 'NFD ', and no additional header fields are present */ UInt32 version; /* version = kCurrentTableVersion */ UInt32 length; /* total length of this table */ UInt32 flags; /* flags: 0x00000001: supplementary-plane Unicode characters supported in mapping and classes 0x00000002: DBCS support (BB/BU tables only) in lookup table */ UInt32 pageBase; /* offset from table header to page table (Ux tables) or dbcsPage table (Bx tables) */ UInt32 lookupBase; /* offset from table header to lookup table(s) */ UInt32 matchClassBase; /* offset from table header to match class definitions */ UInt32 repClassBase; /* offset from table header to replacement class definitions */ UInt32 stringListBase; /* offset from table header to string rule lists */ UInt32 stringRuleData; /* offset from table header to string rule data */ UInt8 maxMatch; /* max number of input code units matched by a rule */ UInt8 maxPre; /* max number of input code units matched by pre-context */ UInt8 maxPost; /* max number of input code units matched by post-context */ UInt8 maxOutput; /* max number of output code units generated by a rule */ UInt32 replacementChar; /* default output for unmapped codes */ }; #ifndef __cplusplus typedef struct TableHeader TableHeader; #endif #define kTableType_BB 0x422d3e42 #define kTableType_BU 0x422d3e55 #define kTableType_UB 0x552d3e42 #define kTableType_UU 0x552d3e55 #define kTableType_NFC 0x4e464320 #define kTableType_NFD 0x4e464420 #define kTableFlags_Supplementary 0x0001 #define kTableFlags_DBCS 0x0002 union Lookup { /* for any table when string rules are used */ struct { UInt8 type; /* 0xff: use string rules 0xfe: illegal DBCS trailing byte 0xfd: unmapped character: copy (BB/UU) or output default (UB/BU) 0x00-0x03: direct lookup */ UInt8 ruleCount; /* number of rules for this code */ UInt16 ruleIndex; /* index into stringList of start of rule list for this code */ } rules; /* for UB and BB tables with direct byte output */ struct { UInt8 count; /* count of bytes present in data[]: 0-3 */ UInt8 data[3]; } bytes; /* for BU and UU tables with direct Unicode output */ UInt32 usv; /* unicode scalar value */ }; #ifndef __cplusplus typedef union Lookup Lookup; #endif #define kLookupType_StringRules 0xff #define kLookupType_IllegalDBCS 0xfe #define kLookupType_Unmapped 0xfd #define kLookupType_RuleTypeMask 0xc0 #define kLookupType_ExtStringRules 0x80 #define kLookupType_ExtRuleCountMask 0x3f /* /rules.ruleOffset/ points to an array of /rules.ruleCount/ UInt32 values which are the offsets from stringRuleData to each rule to test for this character */ struct StringRule { UInt8 matchLength; /* length of match string in matchElements */ UInt8 postLength; /* length of post-context in matchElements */ UInt8 preLength; /* length of pre-context in matchElements */ UInt8 repLength; /* length of replacement string in repElements */ }; #ifndef __cplusplus typedef struct StringRule StringRule; #endif union MatchElem { #ifdef __cplusplus MatchElem() { } #endif struct { UInt8 repeat; /* repeat count: (min << 4) + max */ UInt8 type; /* 0x80: negate flag (not allowed with group) 0x40: non-literal flag--if set, bits 0x3f indicate specific type (value must not be zero) Note that if 'non-literal' flag is NOT set, remaining bits are not used as type code but are part of a USV value (or must be set to zero for literal byte data). */ UInt16 reserved; } flags; union { struct { UInt16 reserved; UInt8 dNext; /* offset to following OR or EGroup element */ UInt8 dAfter; /* offset to element after the group for BGroup */ } bgroup; struct { UInt16 reserved; UInt8 dNext; /* offset to following OR or EGroup element (for OR only) */ UInt8 dStart; /* reverse offset to corresponding BGroup */ } egroup; /* (also used for OR elements) */ struct { UInt16 reserved; UInt16 index; /* index of character class */ } cls; struct { UInt8 reserved[3]; UInt8 data; /* literal byte */ } byte; struct { UInt32 data; /* literal Unicode scalar: must mask with kUSVMask, as top bits overlap flags.repeat and "negate" bit in flags.type */ } usv; } value; }; #ifndef __cplusplus typedef union MatchElem MatchElem; #endif #define kMatchElem_Negate 0x80 /* negated test */ #define kMatchElem_NonLit 0x40 /* test value is not a literal character; need to check type */ #define kMatchElem_TypeMask 0x3f /* Mask for type value. Note that type 0 must not be used (=literal) */ #define kMatchElem_Type_Class 0x01 /* class match */ #define kMatchElem_Type_BGroup 0x02 /* begin group */ #define kMatchElem_Type_EGroup 0x03 /* end group */ #define kMatchElem_Type_OR 0x04 /* special code: OR */ #define kMatchElem_Type_ANY 0x05 /* special code: ANY */ #define kMatchElem_Type_EOS 0x06 /* special code: EOS */ #define kMatchElem_Type_Copy 0x07 /* copy matched item (invalid; for internal compiler use) */ #define kUSVMask 0x001fffff union RepElem { struct { UInt8 type; /* see kRepElem_... below */ UInt8 matchIndex; /* index of corresponding item in matchString for type == kRepElem_Class or kRepElem_Copy */ UInt16 repClass; /* repClass if type == kRepElem_Class */ } flags; UInt32 value; /* literal value (mask with kUSVMask) if flags.type == kRepElem_Literal */ }; #ifndef __cplusplus typedef union RepElem RepElem; #endif #define kRepElem_Literal 0x00 #define kRepElem_Class kMatchElem_Type_Class #define kRepElem_Copy kMatchElem_Type_Copy #define kRepElem_Unmapped 0x0f /* used in default terminator rules */ #endif /* __TECkit_Format_H__ */