/*
Copyright (C) 2021 Kunal Mehta
Copyright (C) 2021 Erutuon

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
#![cfg(feature = "parsing")]

use cached::proc_macro::cached;
use mwtitle::{Error, SiteInfoResponse, TitleCodec};

// Builds a `TitleCodec` from the live siteinfo of the wiki at `domain`.
//
// The result is memoized by the `cached` attribute, keyed on the domain
// string, so each wiki's siteinfo is fetched at most once per cached entry.
// NOTE(review): `sync_writes = true` presumably serializes concurrent cache
// fills so parallel tests do not all hit the API — confirm against the
// `cached` version in use.
//
// Panics if the request fails, the body is not a valid `SiteInfoResponse`,
// or the codec cannot be constructed from it.
#[cached(
    sync_writes = true,
    key = "String",
    convert = r#"{ String::from(domain) }"#
)]
async fn codec(domain: &str) -> TitleCodec {
    let url = format!(
        "https://{domain}/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|interwikimap&formatversion=2&format=json"
    );
    let resp: SiteInfoResponse = reqwest::get(url)
        .await
        .expect("siteinfo request should succeed")
        .json()
        .await
        .expect("siteinfo response should deserialize");
    TitleCodec::from_site_info(resp.query).expect(
        "API doesn't return namespacealiases with invalid namespace IDs",
    )
}

// Asserts that every `input` in `tests` is rejected by the codec for
// `domain`, and that the rejection carries the expected MediaWiki error
// code (the second element of each pair).
//
// `N` is the number of test cases; it is inferred from the array literal at
// the call site.
async fn test_failure<const N: usize>(domain: &str, tests: [(&str, &str); N]) {
    let codec = codec(domain).await;
    for (input, expected) in tests {
        let title = codec.new_title(input);
        assert!(
            title.is_err(),
            "\ninput: {input:?}\n{title:?}\n{expected:?}"
        );
        let error_code = title
            .as_ref()
            .err()
            .and_then(|error| error.mw_title_codec_error_code());
        assert_eq!(
            error_code,
            Some(expected),
            "\ninput: {input:?}\n{title:?}\n{expected:?}"
        );
    }
}

// Test cases initially copied from mediawiki-title (npm package)

// Subject of NS_TALK does not roundtrip to NS_MAIN.
#[tokio::test] #[cfg_attr(miri, ignore)] async fn titles_that_cannot_roundtrip_from_talk_to_main_and_back_are_rejected() { test_failure( "en.wikipedia.org", [ ("Talk:File:Example.svg", "title-invalid-talk-namespace"), ("Talk:_File_:Example.svg", "title-invalid-talk-namespace"), ("Talk:wikt:Example.svg", "title-invalid-talk-namespace"), ("Talk: wikt :Example.svg", "title-invalid-talk-namespace"), ], ) .await; } // Unicode characters that are both control characters and whitespace, // and are forbidden regardless of `wgLegalTitleChars`. #[tokio::test] #[cfg_attr(miri, ignore)] async fn c0_control_character_whitespace_is_rejected() { test_failure( "en.wikipedia.org", [ ("A\t", "title-invalid-characters"), ("A\n", "title-invalid-characters"), ("A\r", "title-invalid-characters"), ("A\tB", "title-invalid-characters"), ("Talk:A\t", "title-invalid-characters"), ("Talk:\tA", "title-invalid-characters"), ("Talk\t:A", "title-invalid-characters"), ("Talk:A\t/B", "title-invalid-characters"), ("Talk:A/\tB", "title-invalid-characters"), ("Talk:A/B\t/C", "title-invalid-characters"), ], ) .await; } // U+0085 is considered whitespace by Unicode but not by `TitleCodec`. #[tokio::test] #[cfg_attr(miri, ignore)] async fn non_mediawiki_whitespace_is_not_collapsed_or_trimmed() { test_success("en.wikipedia.org", [("A \u{85} ", "A_\u{85}")]).await; } // Punctuation characters forbidden regardless of `wgLegalTitleChars`. 
#[tokio::test] #[cfg_attr(miri, ignore)] async fn punctuation_characters_used_in_wikitext_or_html_are_rejected() { test_failure( "en.wikipedia.org", [ ("A [ B", "title-invalid-characters"), ("A ] B", "title-invalid-characters"), ("A { B", "title-invalid-characters"), ("A } B", "title-invalid-characters"), ("A < B", "title-invalid-characters"), ("A > B", "title-invalid-characters"), ("A | B", "title-invalid-characters"), ], ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn url_percent_encoding_is_rejected() { test_failure( "en.wikipedia.org", [ ("A%20B", "title-invalid-characters"), ("A%23B", "title-invalid-characters"), ("A%2523B", "title-invalid-characters"), ], ) .await; } // Directory navigation #[tokio::test] #[cfg_attr(miri, ignore)] async fn relative_directory_syntax_is_rejected() { test_failure( "en.wikipedia.org", [ (".", "title-invalid-relative"), ("..", "title-invalid-relative"), ("./Sandbox", "title-invalid-relative"), ("../Sandbox", "title-invalid-relative"), ("Foo/./Sandbox", "title-invalid-relative"), ("Foo/../Sandbox", "title-invalid-relative"), ("Sandbox/.", "title-invalid-relative"), ("Sandbox/..", "title-invalid-relative"), ], ) .await; } // Namespace prefix without actual title #[tokio::test] #[cfg_attr(miri, ignore)] async fn empty_database_page_titles_are_rejected() { test_failure( "en.wikipedia.org", [ ("", "title-invalid-empty"), (":", "title-invalid-empty"), ("__ __", "title-invalid-empty"), (" __ ", "title-invalid-empty"), ("Talk:", "title-invalid-empty"), ("Talk: _", "title-invalid-empty"), ("Talk:#", "title-invalid-empty"), ("Category: ", "title-invalid-empty"), ("Category: #bar", "title-invalid-empty"), ], ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn replacement_character_is_rejected() { test_failure("en.wikipedia.org", [("�", "title-invalid-utf8")]).await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn magic_tilde_is_rejected() { test_failure( "en.wikipedia.org", [ ("A ~~~ Name", 
"title-invalid-magic-tilde"), ("A ~~~~ Signature", "title-invalid-magic-tilde"), ("A ~~~~~ Timestamp", "title-invalid-magic-tilde"), ], ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn leading_colon_is_rejected() { test_failure( "en.wikipedia.org", [ ("::1", "title-invalid-leading-colon"), ("Category::Test", "title-invalid-leading-colon"), ], ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn title_length_constraints_apply_to_database_page_title() { let codec = codec("en.wikipedia.org").await; // Test cases initially copied from mediawiki-title (npm package) let test_good = [ // Length is 256 total, but only title part matters format!("Category:{}", "x".repeat(247)), // Special pages can have longer titles format!("Special:{}", "x".repeat(499)), "x".repeat(251), "x".repeat(255), // Rust doesn't allow surrogates in strings. // repeat("\u{d83c}\u{df40}", 63), ]; let test_bad = [ "x".repeat(257), format!("Special:{}", "x".repeat(513)), // Rust doesn't allow surrogates in strings. 
// repeat("\u{d83c}\u{df40}", 64) ]; for input in test_good { let title_result = codec.new_title(&input); assert!(title_result.is_ok(), "\n{title_result:?}"); } for input in test_bad { let title_result = codec.new_title(&input); assert!( matches!(title_result, Err(Error::TooLong(_))), "\n{title_result:?}" ); } } async fn test_no_change_with_whitespace_added( domain: &str, tests: [&str; N], ) { let codec = codec(domain).await; for input in tests { let title = codec.new_title(input).ok(); let title2 = codec.new_title(&format!(" {input}_")).ok(); assert_eq!(title, title2); } } #[tokio::test] #[cfg_attr(miri, ignore)] async fn more_than_two_dots_are_accepted() { test_no_change_with_whitespace_added( "en.wikipedia.org", ["Foo/.../Sandbox", "Sandbox/..."], ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn one_or_two_tildes_are_accepted() { test_no_change_with_whitespace_added("en.wikipedia.org", ["~", "A~~"]) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn title_parses_identically_after_leading_and_trailing_whitespace_is_added( ) { test_no_change_with_whitespace_added("en.wikipedia.org", [ "Sandbox", "A \"B\"", "A 'B'", ".com", // We are not supporting standalone fragments // "#", "Test#Abc", "\"", "'", "Talk:Sandbox", "Talk:Foo:Sandbox", "File:Example.svg", "File_talk:Example.svg", ":A", "-", "aũ", "Foo & bar", "\"Believing_Women\"_in_Islam._Unreading_Patriarchal_Interpretations_of_the_Qur\\\'ān" ]).await; } async fn test_success(domain: &str, tests: [(&str, &str); N]) { let codec = codec(domain).await; for (input, expected) in tests { let title_result = codec.new_title(input); assert!(title_result.is_ok(), "{title_result:?}\t{expected:?}"); assert_eq!( title_result .ok() .map(|title| codec.to_underscores(&title)) .as_deref(), Some(expected) ); } } #[tokio::test] #[cfg_attr(miri, ignore)] async fn whitespace_is_trimmed_and_leading_colons_are_removed() { test_success( "en.wikipedia.org", [ ("Test", "Test"), (":Test", "Test"), (": Test", "Test"), 
(":_Test_", "Test"), ("Test 123 456 789", "Test_123_456_789"), ("💩", "💩"), ("Talk: foo", "Talk:Foo"), ("X-Men (film series) #Gambit", "X-Men_(film_series)"), ("Foo _ bar", "Foo_bar"), ( "Foo \u{00A0}\u{1680}\u{180E}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000} bar", "Foo_bar"), ( "Foo\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}bar", "Foobar", ), ( "list of Neighbours characters (2016)#Tom Quill", "List_of_Neighbours_characters_(2016)" ), ]).await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn titles_with_fake_namespaces_arent_trimmed_or_capitalized_after_colon() { test_success( "en.wikipedia.org", [ ("Foo:bar", "Foo:bar"), ("Foo: bar", "Foo:_bar"), ("int:eger", "Int:eger"), ], ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn namespace_aliases_map_to_local_namespace_names() { test_success("en.wikipedia.org", [("WP:eger", "Wikipedia:Eger")]).await; } macro_rules! add_subpage { ( [ $(($input:literal, $expected:literal)),* $(,)? 
] ) => {{ [ $( ($input, $expected), (concat!($input, "/subpage"), concat!($expected, "/subpage")) ),* ] }}; } // Test cases initially copied from mediawiki-title (npm package) #[tokio::test] #[cfg_attr(miri, ignore)] async fn ipv4_address_is_sanitized_in_user_namespaces() { test_success( "en.wikipedia.org", add_subpage!([ ("User:127.000.000.001", "User:127.0.0.1"), ("User:0.0.0.0", "User:0.0.0.0"), ("User:00.00.00.00", "User:0.0.0.0"), ("User:000.000.000.000", "User:0.0.0.0"), ("User:141.000.011.253", "User:141.0.11.253"), ("User: 1.2.4.5", "User:1.2.4.5"), ("User:01.02.04.05", "User:1.2.4.5"), ("User:001.002.004.005", "User:1.2.4.5"), ("User:010.0.000.1", "User:10.0.0.1"), ("User:080.072.250.04", "User:80.72.250.4"), ]), ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn ipv6_address_is_sanitized_in_user_namespaces() { test_success( "en.wikipedia.org", add_subpage!([ ("User:::1", "User:0:0:0:0:0:0:0:1"), ("User:0:0:0:0:0:0:0:1", "User:0:0:0:0:0:0:0:1"), ("User:cebc:2004:f::", "User:CEBC:2004:F:0:0:0:0:0"), ("User:::", "User:0:0:0:0:0:0:0:0"), ("User:0:0:0:1::", "User:0:0:0:1:0:0:0:0"), ("User:3f:535::e:fbb", "User:3F:535:0:0:0:0:E:FBB"), ("User Talk:::1", "User_talk:0:0:0:0:0:0:0:1"), ("User_Talk:::1", "User_talk:0:0:0:0:0:0:0:1"), ("User_talk:::1", "User_talk:0:0:0:0:0:0:0:1"), ("User_talk:::1/24", "User_talk:0:0:0:0:0:0:0:1/24"), ]), ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn non_ip_addresses_are_not_sanitized() { test_success( "en.wikipedia.org", add_subpage!([ ("User:Bar.01", "User:Bar.01"), ("User:Bar.010", "User:Bar.010"), ("User:00.00.00. 00", "User:00.00.00._00"), // No sanitization if there's a space before the slash. 
("User:00.00.00.00 / subpage", "User:00.00.00.00_/_subpage"), ]), ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn ip_addresses_outside_of_user_namespaces_are_not_sanitized() { test_success( "en.wikipedia.org", add_subpage!([ ("0:0:0:0:0:0:0:1", "0:0:0:0:0:0:0:1"), ("127.000.000.001", "127.000.000.001"), ("0.0.0.0", "0.0.0.0"), ("00.00.00.00", "00.00.00.00"), ("000.000.000.000", "000.000.000.000"), ("141.000.011.253", "141.000.011.253"), (" 1.2.4.5", "1.2.4.5"), ("01.02.04.05", "01.02.04.05"), ("001.002.004.005", "001.002.004.005"), ("010.0.000.1", "010.0.000.1"), ("080.072.250.04", "080.072.250.04"), ("Foo.1000.00", "Foo.1000.00"), ("Bar.01", "Bar.01"), ("Bar.010", "Bar.010"), ("cebc:2004:f::", "Cebc:2004:f::"), ("0:0:0:1::", "0:0:0:1::"), ("3f:535::e:fbb", "3f:535::e:fbb"), ]), ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn title_capitalization_follows_php_rules() { test_success( "en.wikipedia.org", [ ("ß", "ß"), ("ʼn", "ʼn"), ("ǰ", "ǰ"), ("ΐ", "ΐ"), ("ΰ", "ΰ"), ("և", "և"), ("ẖ", "ẖ"), ("ẗ", "ẗ"), ("ẘ", "ẘ"), ("ẙ", "ẙ"), ("ẚ", "ẚ"), ("ὐ", "ὐ"), ("ὒ", "ὒ"), ("ὔ", "ὔ"), ("ὖ", "ὖ"), ("ᾀ", "ᾈ"), ("ᾁ", "ᾉ"), ("ᾂ", "ᾊ"), ("ᾃ", "ᾋ"), ("ᾄ", "ᾌ"), ("ᾅ", "ᾍ"), ("ᾆ", "ᾎ"), ("ᾇ", "ᾏ"), ("ᾐ", "ᾘ"), ("ᾑ", "ᾙ"), ("ᾒ", "ᾚ"), ("ᾓ", "ᾛ"), ("ᾔ", "ᾜ"), ("ᾕ", "ᾝ"), ("ᾖ", "ᾞ"), ("ᾗ", "ᾟ"), ("ᾠ", "ᾨ"), ("ᾡ", "ᾩ"), ("ᾢ", "ᾪ"), ("ᾣ", "ᾫ"), ("ᾤ", "ᾬ"), ("ᾥ", "ᾭ"), ("ᾦ", "ᾮ"), ("ᾧ", "ᾯ"), ("ff", "ff"), ("fi", "fi"), ("fl", "fl"), ("ffi", "ffi"), ("ffl", "ffl"), ("ſt", "ſt"), ("st", "st"), ("ﬓ", "ﬓ"), ("ﬔ", "ﬔ"), ("ﬕ", "ﬕ"), ("ﬖ", "ﬖ"), ("ﬗ", "ﬗ"), ("ⓝ", "ⓝ"), ], ) .await; } /// Special handling for `i` first character #[tokio::test] #[cfg_attr(miri, ignore)] async fn dotted_i_is_uppercased_to_dotted_capital_i_according_to_language() { let tests = [ ("tr.wikipedia.org", "iTestTest", "İTestTest"), ("az.wikipedia.org", "iTestTest", "İTestTest"), ("kk.wikipedia.org", "iTestTest", "İTestTest"), ("kaa.wikipedia.org", "iTestTest", "İTestTest"), ("en.wikipedia.org", 
"iTestTest", "ITestTest"), ]; for (domain, input, expected) in tests { let codec = codec(domain).await; let prefixed_text = codec .new_title(input) .map(|title| codec.to_pretty(&title)) .ok(); assert_eq!(prefixed_text.as_deref(), Some(expected)); } } #[tokio::test] #[cfg_attr(miri, ignore)] async fn interwiki_prefix_is_normalized() { test_success( "en.wikipedia.org", [("meta:foobar", "meta:foobar"), ("Meta:foo", "meta:foo")], ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn title_with_local_interwiki_is_capitalized_according_to_namespace() { test_success( "en.wikipedia.org", [ ("w:talk:foo", "Talk:Foo"), ("w:spaces in page", "Spaces_in_page"), ("en:w:Sandbox_", "Sandbox"), ("en:", "Main_Page"), ], ) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn local_interwiki_with_mainspace_and_empty_title_points_to_main_page() { test_success( "en.wikipedia.org", [("en:", "Main_Page"), ("w:", "Main_Page")], ) .await; let codec = codec("en.wikipedia.org").await; for empty_title in ["w:", "w:en:", "en:w:"] { assert_eq!( codec .new_title(empty_title) .map(|title| codec.to_pretty(&title)) .ok() .as_deref(), Some("Main Page"), ); } } #[tokio::test] #[cfg_attr(miri, ignore)] async fn local_interwiki_with_namespace_and_empty_dbkey_is_rejected() { test_failure("en.wikipedia.org", [("w: Talk:", "title-invalid-empty")]) .await; } #[tokio::test] #[cfg_attr(miri, ignore)] async fn default_namespace_is_assigned_to_title_without_namespace_prefix() { const NS_TEMPLATE: i32 = 10; let codec = codec("en.wikipedia.org").await; for template_title in [ "lang", "Lang", "Template:lang", "Template:Lang", "w:template:lang", "w:en:template:lang", "en:w:template:lang", ] { assert_eq!( codec .new_title_with_namespace(template_title, NS_TEMPLATE) .map(|title| codec.to_pretty(&title)) .ok() .as_deref(), Some("Template:Lang"), ); } } #[tokio::test] #[cfg_attr(miri, ignore)] async fn test_to_pretty_with_fragment() { let codec = codec("en.wikipedia.org").await; assert_eq!( codec 
.new_title("Main_Page#Did_you_know_...") .map(|title| codec.to_pretty_with_fragment(&title)) .ok() .as_deref(), Some("Main Page#Did you know ..."), ); } #[tokio::test] #[cfg_attr(miri, ignore)] async fn formatversion_one() { let url = "https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|interwikimap&formatversion=1&format=json"; let resp: SiteInfoResponse = reqwest::get(url).await.unwrap().json().await.unwrap(); TitleCodec::from_site_info(resp.query).unwrap(); } #[tokio::test] #[cfg_attr(miri, ignore)] async fn test_new_title_from_database() { let codec = codec("en.wikipedia.org").await; assert_eq!( codec.to_pretty(&codec.new_title_from_database(0, "foo").unwrap()), "Foo" ); assert_eq!( codec.to_pretty(&codec.new_title_from_database(1, "foo").unwrap()), "Talk:Foo" ); // Invalid namespace is an error let err = codec.new_title_from_database(-100, "foo").unwrap_err(); assert!(matches!(err, Error::UnknownNamespace(_))); }