Skip to content

Commit a0c186c

Browse files
committed
remove XID and Pattern_White_Space unicode tables from libcore
They are only used by rustc_lexer, and are not needed elsewhere. So we move the relevant definitions into rustc_lexer (while the actual unicode data comes from the unicode-xid crate) and make the rest of the compiler use it.
1 parent b9de4ef commit a0c186c

File tree

16 files changed

+69
-464
lines changed

16 files changed

+69
-464
lines changed

Cargo.lock

+13-4
Original file line numberDiff line numberDiff line change
@@ -1011,6 +1011,7 @@ dependencies = [
10111011
name = "fmt_macros"
10121012
version = "0.0.0"
10131013
dependencies = [
1014+
"rustc_lexer",
10141015
"syntax_pos",
10151016
]
10161017

@@ -2372,7 +2373,7 @@ version = "0.4.30"
23722373
source = "registry+https://github.com/rust-lang/crates.io-index"
23732374
checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759"
23742375
dependencies = [
2375-
"unicode-xid",
2376+
"unicode-xid 0.1.0",
23762377
]
23772378

23782379
[[package]]
@@ -3290,7 +3291,7 @@ dependencies = [
32903291
name = "rustc_lexer"
32913292
version = "0.1.0"
32923293
dependencies = [
3293-
"unicode-xid",
3294+
"unicode-xid 0.2.0",
32943295
]
32953296

32963297
[[package]]
@@ -3368,6 +3369,7 @@ dependencies = [
33683369
"rustc_apfloat",
33693370
"rustc_data_structures",
33703371
"rustc_errors",
3372+
"rustc_lexer",
33713373
"rustc_target",
33723374
"serialize",
33733375
"smallvec",
@@ -3976,7 +3978,7 @@ checksum = "641e117d55514d6d918490e47102f7e08d096fdde360247e4a10f7a91a8478d3"
39763978
dependencies = [
39773979
"proc-macro2",
39783980
"quote",
3979-
"unicode-xid",
3981+
"unicode-xid 0.1.0",
39803982
]
39813983

39823984
[[package]]
@@ -3988,7 +3990,7 @@ dependencies = [
39883990
"proc-macro2",
39893991
"quote",
39903992
"syn",
3991-
"unicode-xid",
3993+
"unicode-xid 0.1.0",
39923994
]
39933995

39943996
[[package]]
@@ -4017,6 +4019,7 @@ dependencies = [
40174019
"log",
40184020
"rustc_data_structures",
40194021
"rustc_errors",
4022+
"rustc_lexer",
40204023
"rustc_target",
40214024
"smallvec",
40224025
"syntax",
@@ -4532,6 +4535,12 @@ version = "0.1.0"
45324535
source = "registry+https://github.com/rust-lang/crates.io-index"
45334536
checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
45344537

4538+
[[package]]
4539+
name = "unicode-xid"
4540+
version = "0.2.0"
4541+
source = "registry+https://github.com/rust-lang/crates.io-index"
4542+
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
4543+
45354544
[[package]]
45364545
name = "unicode_categories"
45374546
version = "0.1.1"

src/libcore/char/methods.rs

-23
Original file line numberDiff line numberDiff line change
@@ -547,29 +547,6 @@ impl char {
547547
}
548548
}
549549

550-
/// Returns `true` if this `char` satisfies the `XID_Start` Unicode property, and false
551-
/// otherwise.
552-
///
553-
/// `XID_Start` is a Unicode Derived Property specified in
554-
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
555-
/// mostly similar to `ID_Start` but modified for closure under `NFKx`.
556-
#[unstable(feature = "unicode_internals", issue = "0")]
557-
pub fn is_xid_start(self) -> bool {
558-
derived_property::XID_Start(self)
559-
}
560-
561-
/// Returns `true` if this `char` satisfies the `XID_Continue` Unicode property, and false
562-
/// otherwise.
563-
///
564-
/// `XID_Continue` is a Unicode Derived Property specified in
565-
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
566-
/// mostly similar to `ID_Continue` but modified for closure under NFKx.
567-
#[unstable(feature = "unicode_internals", issue = "0")]
568-
#[inline]
569-
pub fn is_xid_continue(self) -> bool {
570-
derived_property::XID_Continue(self)
571-
}
572-
573550
/// Returns `true` if this `char` is lowercase.
574551
///
575552
/// 'Lowercase' is defined according to the terms of the Unicode Derived Core

src/libcore/unicode/mod.rs

-5
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,3 @@ pub mod derived_property {
1313
pub mod conversions {
1414
pub use crate::unicode::tables::conversions::{to_lower, to_upper};
1515
}
16-
17-
// For use in libsyntax
18-
pub mod property {
19-
pub use crate::unicode::tables::property::Pattern_White_Space;
20-
}

src/libcore/unicode/tables.rs

-375
Large diffs are not rendered by default.

src/libcore/unicode/unicode.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -728,7 +728,7 @@ def generate_property_module(mod, grouped_categories, category_subset):
728728

729729
yield "pub(crate) mod %s {\n" % mod
730730
for cat in sorted(category_subset):
731-
if cat in ("Cc", "White_Space", "Pattern_White_Space"):
731+
if cat in ("Cc", "White_Space"):
732732
generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat])
733733
else:
734734
generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat])
@@ -841,19 +841,18 @@ def main():
841841
unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
842842
load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)
843843

844-
want_derived = {"XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
844+
want_derived = {"Alphabetic", "Lowercase", "Uppercase",
845845
"Cased", "Case_Ignorable", "Grapheme_Extend"}
846846
derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)
847847

848848
props = load_properties(get_path(UnicodeFiles.PROPS),
849-
{"White_Space", "Join_Control", "Noncharacter_Code_Point",
850-
"Pattern_White_Space"})
849+
{"White_Space", "Join_Control", "Noncharacter_Code_Point"})
851850

852851
# Category tables
853852
for (name, categories, category_subset) in (
854853
("general_category", unicode_data.general_categories, ["N", "Cc"]),
855854
("derived_property", derived, want_derived),
856-
("property", props, ["White_Space", "Pattern_White_Space"])
855+
("property", props, ["White_Space"])
857856
):
858857
for fragment in generate_property_module(name, categories, category_subset):
859858
buf.write(fragment)

src/libfmt_macros/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ path = "lib.rs"
1010

1111
[dependencies]
1212
syntax_pos = { path = "../libsyntax_pos" }
13-
13+
rustc_lexer = { path = "../librustc_lexer" }

src/libfmt_macros/lib.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use std::string;
2323
use std::iter;
2424

2525
use syntax_pos::{InnerSpan, Symbol};
26+
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
2627

2728
#[derive(Copy, Clone)]
2829
struct InnerOffset(usize);
@@ -597,12 +598,11 @@ impl<'a> Parser<'a> {
597598
}
598599
}
599600

600-
/// Parses a word starting at the current position. A word is considered to
601-
/// be an alphabetic character followed by any number of alphanumeric
602-
/// characters.
601+
/// Parses a word starting at the current position. A word is the same as
602+
/// a Rust identifier, except that it can't start with the `_` character.
603603
fn word(&mut self) -> &'a str {
604604
let start = match self.cur.peek() {
605-
Some(&(pos, c)) if c.is_xid_start() => {
605+
Some(&(pos, c)) if c != '_' && is_id_start(c) => {
606606
self.cur.next();
607607
pos
608608
}
@@ -611,7 +611,7 @@ impl<'a> Parser<'a> {
611611
}
612612
};
613613
while let Some(&(pos, c)) = self.cur.peek() {
614-
if c.is_xid_continue() {
614+
if is_id_continue(c) {
615615
self.cur.next();
616616
} else {
617617
return &self.input[start..pos];

src/librustc_lexer/Cargo.toml

+4-4
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@ name = "rustc_lexer"
44
version = "0.1.0"
55
edition = "2018"
66

7-
# Note that this crate purposefully does not depend on other rustc crates
8-
[dependencies]
9-
unicode-xid = { version = "0.1.0", optional = true }
10-
117
# Note: do not remove this blank `[lib]` section.
128
# This will be used when publishing this crate as `rustc-ap-rustc_lexer`.
139
[lib]
1410
doctest = false
1511
name = "rustc_lexer"
12+
13+
# Note that this crate purposefully does not depend on other rustc crates
14+
[dependencies]
15+
unicode-xid = "0.2.0"

src/librustc_lexer/src/lib.rs

+30-33
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
// We want to be able to build this crate with a stable compiler, so feature
2-
// flags should be optional.
3-
#![cfg_attr(not(feature = "unicode-xid"), feature(unicode_internals))]
1+
// We want to be able to build this crate with a stable compiler, so no
2+
// `#![feature]` attributes should be added.
43

54
mod cursor;
65
pub mod unescape;
@@ -507,54 +506,52 @@ impl Cursor<'_> {
507506
}
508507

509508
pub mod character_properties {
510-
// this is Pattern_White_Space
511-
#[cfg(feature = "unicode-xid")]
509+
// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
510+
// classes.
511+
512+
// This is Pattern_White_Space.
513+
//
514+
// Note that this set is stable (i.e., it doesn't change with different
515+
// Unicode versions), so it's ok to just hard-code the values.
512516
pub fn is_whitespace(c: char) -> bool {
513517
match c {
514-
'\u{0009}' | '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{0020}'
515-
| '\u{0085}' | '\u{200E}' | '\u{200F}' | '\u{2028}' | '\u{2029}' => true,
518+
// Usual ASCII suspects
519+
| '\u{0009}' // \t
520+
| '\u{000A}' // \n
521+
| '\u{000B}' // vertical tab
522+
| '\u{000C}' // form feed
523+
| '\u{000D}' // \r
524+
| '\u{0020}' // space
525+
526+
// NEXT LINE from latin1
527+
| '\u{0085}'
528+
529+
// Bidi markers
530+
| '\u{200E}' // LEFT-TO-RIGHT MARK
531+
| '\u{200F}' // RIGHT-TO-LEFT MARK
532+
533+
// Dedicated whitespace characters from Unicode
534+
| '\u{2028}' // LINE SEPARATOR
535+
| '\u{2029}' // PARAGRAPH SEPARATOR
536+
=> true,
516537
_ => false,
517538
}
518539
}
519540

520-
#[cfg(not(feature = "unicode-xid"))]
521-
pub fn is_whitespace(c: char) -> bool {
522-
core::unicode::property::Pattern_White_Space(c)
523-
}
524-
525-
// this is XID_Start OR '_' (which formally is not a XID_Start)
526-
#[cfg(feature = "unicode-xid")]
541+
// This is XID_Start OR '_' (which formally is not a XID_Start).
527542
pub fn is_id_start(c: char) -> bool {
528543
('a' <= c && c <= 'z')
529544
|| ('A' <= c && c <= 'Z')
530545
|| c == '_'
531546
|| (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c))
532547
}
533548

534-
#[cfg(not(feature = "unicode-xid"))]
535-
pub fn is_id_start(c: char) -> bool {
536-
('a' <= c && c <= 'z')
537-
|| ('A' <= c && c <= 'Z')
538-
|| c == '_'
539-
|| (c > '\x7f' && c.is_xid_start())
540-
}
541-
542-
// this is XID_Continue
543-
#[cfg(feature = "unicode-xid")]
549+
// This is XID_Continue.
544550
pub fn is_id_continue(c: char) -> bool {
545551
('a' <= c && c <= 'z')
546552
|| ('A' <= c && c <= 'Z')
547553
|| ('0' <= c && c <= '9')
548554
|| c == '_'
549555
|| (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
550556
}
551-
552-
#[cfg(not(feature = "unicode-xid"))]
553-
pub fn is_id_continue(c: char) -> bool {
554-
('a' <= c && c <= 'z')
555-
|| ('A' <= c && c <= 'Z')
556-
|| ('0' <= c && c <= '9')
557-
|| c == '_'
558-
|| (c > '\x7f' && c.is_xid_continue())
559-
}
560557
}

src/librustc_mir/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ rustc = { path = "../librustc" }
2020
rustc_target = { path = "../librustc_target" }
2121
rustc_data_structures = { path = "../librustc_data_structures" }
2222
rustc_errors = { path = "../librustc_errors" }
23+
rustc_lexer = { path = "../librustc_lexer" }
2324
rustc_serialize = { path = "../libserialize", package = "serialize" }
2425
syntax = { path = "../libsyntax" }
2526
syntax_pos = { path = "../libsyntax_pos" }

src/librustc_mir/borrow_check/move_errors.rs

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
use core::unicode::property::Pattern_White_Space;
2-
31
use rustc::mir::*;
42
use rustc::ty;
53
use rustc_errors::{DiagnosticBuilder,Applicability};
4+
use rustc_lexer::character_properties::is_whitespace;
65
use syntax_pos::Span;
76

87
use crate::borrow_check::MirBorrowckCtxt;
@@ -526,7 +525,7 @@ impl<'a, 'tcx> MirBorrowckCtxt<'a, 'tcx> {
526525
let suggestion;
527526
let to_remove;
528527
if pat_snippet.starts_with("mut")
529-
&& pat_snippet["mut".len()..].starts_with(Pattern_White_Space)
528+
&& pat_snippet["mut".len()..].starts_with(is_whitespace)
530529
{
531530
suggestion = pat_snippet["mut".len()..].trim_start();
532531
to_remove = "&mut";

src/librustc_mir/borrow_check/mutability_errors.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use core::unicode::property::Pattern_White_Space;
1+
use rustc_lexer::character_properties::is_whitespace;
22
use rustc::hir;
33
use rustc::hir::Node;
44
use rustc::mir::{self, BindingForm, ClearCrossCrate, Local, Location, Body};
@@ -715,7 +715,7 @@ fn annotate_struct_field(
715715
fn suggest_ref_mut(tcx: TyCtxt<'_>, binding_span: Span) -> Option<String> {
716716
let hi_src = tcx.sess.source_map().span_to_snippet(binding_span).ok()?;
717717
if hi_src.starts_with("ref")
718-
&& hi_src["ref".len()..].starts_with(Pattern_White_Space)
718+
&& hi_src["ref".len()..].starts_with(is_whitespace)
719719
{
720720
let replacement = format!("ref mut{}", &hi_src["ref".len()..]);
721721
Some(replacement)

src/librustdoc/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ extern crate rustc_interface;
3333
extern crate rustc_metadata;
3434
extern crate rustc_target;
3535
extern crate rustc_typeck;
36+
extern crate rustc_lexer;
3637
extern crate serialize;
3738
extern crate syntax;
3839
extern crate syntax_pos;

src/librustdoc/test.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use rustc::hir;
44
use rustc::hir::intravisit;
55
use rustc::session::{self, config, DiagnosticOutput};
66
use rustc::util::common::ErrorReported;
7+
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
78
use syntax::ast;
89
use syntax::with_globals;
910
use syntax::source_map::SourceMap;
@@ -763,8 +764,8 @@ impl Tester for Collector {
763764
// We use these headings as test names, so it's good if
764765
// they're valid identifiers.
765766
let name = name.chars().enumerate().map(|(i, c)| {
766-
if (i == 0 && c.is_xid_start()) ||
767-
(i != 0 && c.is_xid_continue()) {
767+
if (i == 0 && is_id_start(c)) ||
768+
(i != 0 && is_id_continue(c)) {
768769
c
769770
} else {
770771
'_'

src/libsyntax/ext/proc_macro_server.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use crate::tokenstream::{self, DelimSpan, IsJoint::*, TokenStream, TreeAndJoint}
66

77
use errors::{Diagnostic, DiagnosticBuilder};
88
use rustc_data_structures::sync::Lrc;
9+
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
910
use syntax_pos::{BytePos, FileName, MultiSpan, Pos, SourceFile, Span};
1011
use syntax_pos::symbol::{kw, sym, Symbol};
1112

@@ -322,8 +323,7 @@ impl Ident {
322323
fn is_valid(string: &str) -> bool {
323324
let mut chars = string.chars();
324325
if let Some(start) = chars.next() {
325-
(start == '_' || start.is_xid_start())
326-
&& chars.all(|cont| cont == '_' || cont.is_xid_continue())
326+
is_id_start(start) && chars.all(is_id_continue)
327327
} else {
328328
false
329329
}

src/libsyntax_ext/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,4 @@ rustc_target = { path = "../librustc_target" }
1818
smallvec = { version = "0.6.7", features = ["union", "may_dangle"] }
1919
syntax = { path = "../libsyntax" }
2020
syntax_pos = { path = "../libsyntax_pos" }
21+
rustc_lexer = { path = "../librustc_lexer" }

0 commit comments

Comments
 (0)