/**
 * Copyright 2017 Gerald Haesendonck
 * Licensed under the Academic Free License version 3.0
 **/

// rust-peg grammar for N-Triples.
// Parsing is based on https://www.w3.org/TR/n-triples/#n-triples-grammar
// The bracketed numbers next to the rules are the production numbers of that grammar.

use std::char::from_u32;
use super::{Literal, TypeLang, Predicate, Subject, Object, Triple};

// Added (not part of the W3C grammar): entry point for a single line.
// Yields `Some(triple)` when the line holds a triple, and `None` when it holds
// nothing but optional whitespace and an optional comment.
#[pub] // added
triple_line -> Option<Triple>
    = t: triple { Some(t) }
    / WS* COMMENT? { None }

// One triple terminated by '.', with optional whitespace around every term and
// an optional trailing comment.
#[pub] //[2]
triple -> Triple
    = WS* s: subject WS* p: predicate WS* o: object WS* '.' WS* COMMENT? {
        Triple {subject: s, predicate: p, object: o}
    }

// A subject is either an IRI or a blank node.
#[pub] //[3]
subject -> Subject
    = iri: IRIREF { Subject::IriRef(iri) }
    / bnode: BLANK_NODE_LABEL { Subject::BNode(String::from(bnode)) }

// A predicate is always an IRI.
#[pub] //[4]
predicate -> Predicate
    = iri: IRIREF { Predicate::IriRef(iri) }

// An object is an IRI, a blank node or a literal.
#[pub] //[5]
object -> Object
    = iri: IRIREF { Object::IriRef(iri) }
    / bnode: BLANK_NODE_LABEL { Object::BNode(String::from(bnode)) }
    / lit: literal { Object::Lit(lit) }

// A quoted string optionally followed by a datatype IRI or a language tag.
// A literal without either defaults to the xsd:string datatype, whose namespace
// is http:// (not https://).
#[pub] //[6]
literal -> Literal
    = lit: STRING_LITERAL_QUOTE tl: (type_lang)? {
        let data_type = tl.unwrap_or_else(|| TypeLang::Type(String::from("http://www.w3.org/2001/XMLSchema#string")));
        Literal {data: lit, data_type: data_type}
    }

// Intermediate rule to determine the data type or the language of a literal.
type_lang -> TypeLang
    = '^^' iri: IRIREF { TypeLang::Type(iri) }
    / lang: LANGTAG { TypeLang::Lang(lang) }

///// TERMINALS /////

// Language tag; the returned string does not include the leading '@'.
#[pub] //[144s]
LANGTAG -> String
    = '@' lang: $([a-zA-Z]+ ('-' [a-zA-Z0-9]+)*) { String::from(lang) }

// EOL [7] is deliberately left undefined (original note: "not necessary ?");
// presumably callers hand the parser one line at a time -- TODO confirm.
//#[pub] //[7]
//EOL
//  = '[\r\n]+'

// IRI between angle brackets. The brackets are dropped and UCHAR escapes are decoded.
// `$(...)` over a one-character class always yields exactly one char, so the
// `unwrap()` on `next()` cannot fail.
#[pub] //[8]
IRIREF -> String
    = '<' all: ( c1: $([^\u{0000}-\u{0020}<>"{}|^`\\]) {c1.chars().next().unwrap()} / UCHAR)* '>' {
        all.into_iter().collect()
    }

// Double-quoted string. The quotes are dropped and ECHAR / UCHAR escapes are decoded.
#[pub] //[9]
STRING_LITERAL_QUOTE -> String
    = '"' all: (ECHAR / UCHAR / c1: $([^\u{0022}\u{005c}\u{000a}\u{000d}]) {c1.chars().next().unwrap()})* '"' {
        all.into_iter().collect()
    }

// Blank node label; the returned slice does not include the leading "_:".
// The W3C production is
//     '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)?
// PEG repetition is greedy and never gives characters back, so a literal
// transcription would swallow a trailing '.'. Every string matched by
// `(PN_CHARS | '.')* PN_CHARS` is a sequence of "zero or more dots followed by
// one PN_CHARS" groups, which is what the rule below matches; a '.' that is not
// followed by a PN_CHARS is left for the enclosing rule.
#[pub] //[141s]
BLANK_NODE_LABEL -> &'input str
    = '_:' bnode_label: $((PN_CHARS_U / [0-9]) ('.'* PN_CHARS)*) { bnode_label }

// Numeric escape: \u followed by 4 hex digits or \U followed by 8 hex digits.
// `{? }` is a conditional action: a value that is not a valid `char` (a surrogate,
// or anything above U+10FFFF) makes the rule fail instead of panicking.
#[pub] //[10]
UCHAR -> char
    = '\\u' hex_nr: $(HEX HEX HEX HEX) {?
        u32::from_str_radix(hex_nr, 16).ok().and_then(from_u32).ok_or("valid Unicode code point")
    }
    / '\\U' hex_nr: $(HEX HEX HEX HEX HEX HEX HEX HEX) {?
        u32::from_str_radix(hex_nr, 16).ok().and_then(from_u32).ok_or("valid Unicode code point")
    }

// Single-character escape: a backslash followed by one of t b n r f " ' \
// The character class admits only the eight characters handled below, so the
// catch-all arm can never be reached.
//[153s]
ECHAR -> char
    = '\\' esc_char: $([tbnrf"'\\]) {
        match esc_char {
            "t" => '\u{0009}',
            "b" => '\u{0008}',
            "n" => '\u{000A}',
            "r" => '\u{000D}',
            "f" => '\u{000C}',
            "\"" => '\u{0022}',
            "'" => '\u{0027}',
            "\\" => '\u{005C}',
            _ => unreachable!("ECHAR character class only admits the escapes listed above")
        }
    }

// Base characters allowed in names.
// The last range of the W3C production, \u{10000}-\u{EFFFF}, is left out; the
// original author's note points to http://doc.rust-lang.org/rustc_unicode/char/
// for the reason.
//[157s]
PN_CHARS_BASE -> char
    = c1: $([a-zA-Z\u{00C0}-\u{00D6}\u{00D8}-\u{00F6}\u{00F8}-\u{02FF}\u{0370}-\u{037D}\u{037F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}]) {
        c1.chars().next().unwrap()
    }

// PN_CHARS_BASE plus '_' and ':'. Each alternative returns the character it matched.
//[158s]
PN_CHARS_U -> char
    = PN_CHARS_BASE
    / '_' {'_'}
    / ':' {':'}

// PN_CHARS_U plus '-', digits and a few combining / connector characters.
//[160s]
PN_CHARS -> char
    = PN_CHARS_U
    / c2: $([\-0-9\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}]) {c2.chars().next().unwrap()}

// One hexadecimal digit.
//[162s]
HEX = [0-9a-fA-F]

// Added (not part of the W3C grammar): one space or tab.
WS = [ \t]

// Added (not part of the W3C grammar): '#' followed by everything that remains of the input.
COMMENT = '#' .*