remove XID and Pattern_White_Space unicode tables from libcore

matklad · matklad · commit a0c186c34f67 · 2019-09-04T13:11:11.000+03:00
They are only used by rustc_lexer, and are not needed elsewhere.

So we move the relevant definitions into rustc_lexer (while the actual
Unicode data comes from the unicode-xid crate) and make the rest of
the compiler use it.
diff --git a/Cargo.lock b/Cargo.lock
@@ -1011,6 +1011,7 @@ dependencies = [
 name = "fmt_macros"
 version = "0.0.0"
 dependencies = [
+ "rustc_lexer",
  "syntax_pos",
 ]
 
@@ -2372,7 +2373,7 @@ version = "0.4.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759"
 dependencies = [
- "unicode-xid",
+ "unicode-xid 0.1.0",
 ]
 
 [[package]]
@@ -3290,7 +3291,7 @@ dependencies = [
 name = "rustc_lexer"
 version = "0.1.0"
 dependencies = [
- "unicode-xid",
+ "unicode-xid 0.2.0",
 ]
 
 [[package]]
@@ -3368,6 +3369,7 @@ dependencies = [
  "rustc_apfloat",
  "rustc_data_structures",
  "rustc_errors",
+ "rustc_lexer",
  "rustc_target",
  "serialize",
  "smallvec",
@@ -3976,7 +3978,7 @@ checksum = "641e117d55514d6d918490e47102f7e08d096fdde360247e4a10f7a91a8478d3"
 dependencies = [
  "proc-macro2",
  "quote",
- "unicode-xid",
+ "unicode-xid 0.1.0",
 ]
 
 [[package]]
@@ -3988,7 +3990,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "syn",
- "unicode-xid",
+ "unicode-xid 0.1.0",
 ]
 
 [[package]]
@@ -4017,6 +4019,7 @@ dependencies = [
  "log",
  "rustc_data_structures",
  "rustc_errors",
+ "rustc_lexer",
  "rustc_target",
  "smallvec",
  "syntax",
@@ -4532,6 +4535,12 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
 
+[[package]]
+name = "unicode-xid"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
+
 [[package]]
 name = "unicode_categories"
 version = "0.1.1"
diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs
@@ -547,29 +547,6 @@ impl char {
         }
     }
 
-    /// Returns `true` if this `char` satisfies the `XID_Start` Unicode property, and false
-    /// otherwise.
-    ///
-    /// `XID_Start` is a Unicode Derived Property specified in
-    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
-    /// mostly similar to `ID_Start` but modified for closure under `NFKx`.
-    #[unstable(feature = "unicode_internals", issue = "0")]
-    pub fn is_xid_start(self) -> bool {
-        derived_property::XID_Start(self)
-    }
-
-    /// Returns `true` if this `char` satisfies the `XID_Continue` Unicode property, and false
-    /// otherwise.
-    ///
-    /// `XID_Continue` is a Unicode Derived Property specified in
-    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
-    /// mostly similar to `ID_Continue` but modified for closure under NFKx.
-    #[unstable(feature = "unicode_internals", issue = "0")]
-    #[inline]
-    pub fn is_xid_continue(self) -> bool {
-        derived_property::XID_Continue(self)
-    }
-
     /// Returns `true` if this `char` is lowercase.
     ///
     /// 'Lowercase' is defined according to the terms of the Unicode Derived Core
diff --git a/src/libcore/unicode/mod.rs b/src/libcore/unicode/mod.rs
@@ -13,8 +13,3 @@ pub mod derived_property {
 pub mod conversions {
     pub use crate::unicode::tables::conversions::{to_lower, to_upper};
 }
-
-// For use in libsyntax
-pub mod property {
-    pub use crate::unicode::tables::property::Pattern_White_Space;
-}
diff --git a/src/libcore/unicode/tables.rs b/src/libcore/unicode/tables.rs
diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py
@@ -728,7 +728,7 @@ def generate_property_module(mod, grouped_categories, category_subset):
 
     yield "pub(crate) mod %s {\n" % mod
     for cat in sorted(category_subset):
-        if cat in ("Cc", "White_Space", "Pattern_White_Space"):
+        if cat in ("Cc", "White_Space"):
             generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat])
         else:
             generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat])
@@ -841,19 +841,18 @@ def main():
     unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
     load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)
 
-    want_derived = {"XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
+    want_derived = {"Alphabetic", "Lowercase", "Uppercase",
                     "Cased", "Case_Ignorable", "Grapheme_Extend"}
     derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)
 
     props = load_properties(get_path(UnicodeFiles.PROPS),
-                            {"White_Space", "Join_Control", "Noncharacter_Code_Point",
-                             "Pattern_White_Space"})
+                            {"White_Space", "Join_Control", "Noncharacter_Code_Point"})
 
     # Category tables
     for (name, categories, category_subset) in (
             ("general_category", unicode_data.general_categories, ["N", "Cc"]),
             ("derived_property", derived, want_derived),
-            ("property", props, ["White_Space", "Pattern_White_Space"])
+            ("property", props, ["White_Space"])
     ):
         for fragment in generate_property_module(name, categories, category_subset):
             buf.write(fragment)
diff --git a/src/libfmt_macros/Cargo.toml b/src/libfmt_macros/Cargo.toml
@@ -10,4 +10,4 @@ path = "lib.rs"
 
 [dependencies]
 syntax_pos = { path = "../libsyntax_pos" }
-
+rustc_lexer = { path = "../librustc_lexer" }
diff --git a/src/libfmt_macros/lib.rs b/src/libfmt_macros/lib.rs
@@ -23,6 +23,7 @@ use std::string;
 use std::iter;
 
 use syntax_pos::{InnerSpan, Symbol};
+use rustc_lexer::character_properties::{is_id_start, is_id_continue};
 
 #[derive(Copy, Clone)]
 struct InnerOffset(usize);
@@ -597,12 +598,11 @@ impl<'a> Parser<'a> {
         }
     }
 
-    /// Parses a word starting at the current position. A word is considered to
-    /// be an alphabetic character followed by any number of alphanumeric
-    /// characters.
+    /// Parses a word starting at the current position. A word is the same as
+    /// a Rust identifier, except that it can't start with the `_` character.
     fn word(&mut self) -> &'a str {
         let start = match self.cur.peek() {
-            Some(&(pos, c)) if c.is_xid_start() => {
+            Some(&(pos, c)) if c != '_' && is_id_start(c) => {
                 self.cur.next();
                 pos
             }
@@ -611,7 +611,7 @@ impl<'a> Parser<'a> {
             }
         };
         while let Some(&(pos, c)) = self.cur.peek() {
-            if c.is_xid_continue() {
+            if is_id_continue(c) {
                 self.cur.next();
             } else {
                 return &self.input[start..pos];
diff --git a/src/librustc_lexer/Cargo.toml b/src/librustc_lexer/Cargo.toml
@@ -4,12 +4,12 @@ name = "rustc_lexer"
 version = "0.1.0"
 edition = "2018"
 
-# Note that this crate purposefully does not depend on other rustc crates
-[dependencies]
-unicode-xid = { version = "0.1.0", optional = true }
-
 # Note: do not remove this blank `[lib]` section.
 # This will be used when publishing this crate as `rustc-ap-rustc_lexer`.
 [lib]
 doctest = false
 name = "rustc_lexer"
+
+# Note that this crate purposefully does not depend on other rustc crates
+[dependencies]
+unicode-xid = "0.2.0"
diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs
@@ -1,6 +1,5 @@
-// We want to be able to build this crate with a stable compiler, so feature
-// flags should be optional.
-#![cfg_attr(not(feature = "unicode-xid"), feature(unicode_internals))]
+// We want to be able to build this crate with a stable compiler, so no
+// `#![feature]` attributes should be added.
 
 mod cursor;
 pub mod unescape;
@@ -507,54 +506,52 @@ impl Cursor<'_> {
 }
 
 pub mod character_properties {
-    // this is Pattern_White_Space
-    #[cfg(feature = "unicode-xid")]
+    // See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
+    // classes.
+
+    // This is Pattern_White_Space.
+    //
+    // Note that this set is stable (i.e., it doesn't change with different
+    // Unicode versions), so it's ok to just hard-code the values.
     pub fn is_whitespace(c: char) -> bool {
         match c {
-            '\u{0009}' | '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{0020}'
-            | '\u{0085}' | '\u{200E}' | '\u{200F}' | '\u{2028}' | '\u{2029}' => true,
+            // Usual ASCII suspects
+            | '\u{0009}' // \t
+            | '\u{000A}' // \n
+            | '\u{000B}' // vertical tab
+            | '\u{000C}' // form feed
+            | '\u{000D}' // \r
+            | '\u{0020}' // space
+
+            // NEXT LINE from latin1
+            | '\u{0085}'
+
+            // Bidi markers
+            | '\u{200E}' // LEFT-TO-RIGHT MARK
+            | '\u{200F}' // RIGHT-TO-LEFT MARK
+
+            // Dedicated whitespace characters from Unicode
+            | '\u{2028}' // LINE SEPARATOR
+            | '\u{2029}' // PARAGRAPH SEPARATOR
+              => true,
             _ => false,
         }
     }
 
-    #[cfg(not(feature = "unicode-xid"))]
-    pub fn is_whitespace(c: char) -> bool {
-        core::unicode::property::Pattern_White_Space(c)
-    }
-
-    // this is XID_Start OR '_' (which formally is not a XID_Start)
-    #[cfg(feature = "unicode-xid")]
+    // This is XID_Start OR '_' (which formally is not a XID_Start).
     pub fn is_id_start(c: char) -> bool {
         ('a' <= c && c <= 'z')
             || ('A' <= c && c <= 'Z')
             || c == '_'
             || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c))
     }
 
-    #[cfg(not(feature = "unicode-xid"))]
-    pub fn is_id_start(c: char) -> bool {
-        ('a' <= c && c <= 'z')
-            || ('A' <= c && c <= 'Z')
-            || c == '_'
-            || (c > '\x7f' && c.is_xid_start())
-    }
-
-    // this is XID_Continue
-    #[cfg(feature = "unicode-xid")]
+    // This is XID_Continue.
     pub fn is_id_continue(c: char) -> bool {
         ('a' <= c && c <= 'z')
             || ('A' <= c && c <= 'Z')
             || ('0' <= c && c <= '9')
             || c == '_'
             || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
     }
-
-    #[cfg(not(feature = "unicode-xid"))]
-    pub fn is_id_continue(c: char) -> bool {
-        ('a' <= c && c <= 'z')
-            || ('A' <= c && c <= 'Z')
-            || ('0' <= c && c <= '9')
-            || c == '_'
-            || (c > '\x7f' && c.is_xid_continue())
-    }
 }
diff --git a/src/librustc_mir/Cargo.toml b/src/librustc_mir/Cargo.toml
@@ -20,6 +20,7 @@ rustc = { path = "../librustc" }
 rustc_target = { path = "../librustc_target" }
 rustc_data_structures = { path = "../librustc_data_structures" }
 rustc_errors = { path = "../librustc_errors" }
+rustc_lexer = { path = "../librustc_lexer" }
 rustc_serialize = { path = "../libserialize", package = "serialize" }
 syntax = { path = "../libsyntax" }
 syntax_pos = { path = "../libsyntax_pos" }
diff --git a/src/librustc_mir/borrow_check/move_errors.rs b/src/librustc_mir/borrow_check/move_errors.rs
@@ -1,8 +1,7 @@
-use core::unicode::property::Pattern_White_Space;
-
 use rustc::mir::*;
 use rustc::ty;
 use rustc_errors::{DiagnosticBuilder,Applicability};
+use rustc_lexer::character_properties::is_whitespace;
 use syntax_pos::Span;
 
 use crate::borrow_check::MirBorrowckCtxt;
@@ -526,7 +525,7 @@ impl<'a, 'tcx> MirBorrowckCtxt<'a, 'tcx> {
                         let suggestion;
                         let to_remove;
                         if pat_snippet.starts_with("mut")
-                            && pat_snippet["mut".len()..].starts_with(Pattern_White_Space)
+                            && pat_snippet["mut".len()..].starts_with(is_whitespace)
                         {
                             suggestion = pat_snippet["mut".len()..].trim_start();
                             to_remove = "&mut";
diff --git a/src/librustc_mir/borrow_check/mutability_errors.rs b/src/librustc_mir/borrow_check/mutability_errors.rs
@@ -1,4 +1,4 @@
-use core::unicode::property::Pattern_White_Space;
+use rustc_lexer::character_properties::is_whitespace;
 use rustc::hir;
 use rustc::hir::Node;
 use rustc::mir::{self, BindingForm, ClearCrossCrate, Local, Location, Body};
@@ -715,7 +715,7 @@ fn annotate_struct_field(
 fn suggest_ref_mut(tcx: TyCtxt<'_>, binding_span: Span) -> Option<String> {
     let hi_src = tcx.sess.source_map().span_to_snippet(binding_span).ok()?;
     if hi_src.starts_with("ref")
-        && hi_src["ref".len()..].starts_with(Pattern_White_Space)
+        && hi_src["ref".len()..].starts_with(is_whitespace)
     {
         let replacement = format!("ref mut{}", &hi_src["ref".len()..]);
         Some(replacement)
diff --git a/src/librustdoc/lib.rs b/src/librustdoc/lib.rs
@@ -33,6 +33,7 @@ extern crate rustc_interface;
 extern crate rustc_metadata;
 extern crate rustc_target;
 extern crate rustc_typeck;
+extern crate rustc_lexer;
 extern crate serialize;
 extern crate syntax;
 extern crate syntax_pos;
diff --git a/src/librustdoc/test.rs b/src/librustdoc/test.rs
@@ -4,6 +4,7 @@ use rustc::hir;
 use rustc::hir::intravisit;
 use rustc::session::{self, config, DiagnosticOutput};
 use rustc::util::common::ErrorReported;
+use rustc_lexer::character_properties::{is_id_start, is_id_continue};
 use syntax::ast;
 use syntax::with_globals;
 use syntax::source_map::SourceMap;
@@ -763,8 +764,8 @@ impl Tester for Collector {
             // We use these headings as test names, so it's good if
             // they're valid identifiers.
             let name = name.chars().enumerate().map(|(i, c)| {
-                    if (i == 0 && c.is_xid_start()) ||
-                        (i != 0 && c.is_xid_continue()) {
+                    if (i == 0 && is_id_start(c)) ||
+                        (i != 0 && is_id_continue(c)) {
                         c
                     } else {
                         '_'
diff --git a/src/libsyntax/ext/proc_macro_server.rs b/src/libsyntax/ext/proc_macro_server.rs
@@ -6,6 +6,7 @@ use crate::tokenstream::{self, DelimSpan, IsJoint::*, TokenStream, TreeAndJoint}
 
 use errors::{Diagnostic, DiagnosticBuilder};
 use rustc_data_structures::sync::Lrc;
+use rustc_lexer::character_properties::{is_id_start, is_id_continue};
 use syntax_pos::{BytePos, FileName, MultiSpan, Pos, SourceFile, Span};
 use syntax_pos::symbol::{kw, sym, Symbol};
 
@@ -322,8 +323,7 @@ impl Ident {
     fn is_valid(string: &str) -> bool {
         let mut chars = string.chars();
         if let Some(start) = chars.next() {
-            (start == '_' || start.is_xid_start())
-                && chars.all(|cont| cont == '_' || cont.is_xid_continue())
+            is_id_start(start) && chars.all(is_id_continue)
         } else {
             false
         }
diff --git a/src/libsyntax_ext/Cargo.toml b/src/libsyntax_ext/Cargo.toml
@@ -18,3 +18,4 @@ rustc_target = { path = "../librustc_target" }
 smallvec = { version = "0.6.7", features = ["union", "may_dangle"] }
 syntax = { path = "../libsyntax" }
 syntax_pos = { path = "../libsyntax_pos" }
+rustc_lexer = { path = "../librustc_lexer" }

Original file line number	Diff line number	Diff line change
`@@ -13,8 +13,3 @@ pub mod derived_property {`
`13`	`13`	`pub mod conversions {`
`14`	`14`	`pub use crate::unicode::tables::conversions::{to_lower, to_upper};`
`15`	`15`	`}`
`16`		`-`
`17`		`-// For use in libsyntax`
`18`		`-pub mod property {`
`19`		`- pub use crate::unicode::tables::property::Pattern_White_Space;`
`20`		`-}`