Skip to content

Commit 55adb69

Browse files
committed
optimization: Eliminate Cased table
`Cased` is a derived property: it is the union of the `Lowercase` property, the `Uppercase` property, and the `Titlecase_Letter` (`Lt`) general category. We already have lookup tables for `Lowercase` and `Uppercase`, and `Titlecase_Letter` is very small. So instead of duplicating that data in a separate lookup table for `Cased`, just test each of those three properties in turn. This will probably be slower than the old approach, but `is_cased` is not a public API: it is only used in `string::to_lower` when deciding whether a Greek "sigma" should be mapped to `ς` or to `σ`. This is a very rare case, so it should not be performance sensitive.
1 parent fca2e94 commit 55adb69

File tree

5 files changed

+48
-123
lines changed

5 files changed

+48
-123
lines changed

library/core/src/char/methods.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,11 @@ impl char {
985985
#[doc(hidden)]
986986
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
987987
pub fn is_cased(self) -> bool {
988-
if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
988+
if self.is_ascii() {
989+
self.is_ascii_alphabetic()
990+
} else {
991+
unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self)
992+
}
989993
}
990994

991995
/// Returns `true` if this `char` has the `Case_Ignorable` property.

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
// for use in alloc, not re-exported in std.
66
#[rustfmt::skip]
77
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
8-
pub use unicode_data::cased::lookup as Cased;
98
pub use unicode_data::conversions;
109

1110
#[rustfmt::skip]
1211
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
1312
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
1413
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
14+
pub(crate) use unicode_data::lt::lookup as Lt;
1515
pub(crate) use unicode_data::n::lookup as N;
1616
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
1717
pub(crate) use unicode_data::white_space::lookup as White_Space;

library/core/src/unicode/unicode_data.rs

Lines changed: 34 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
22
// Alphabetic : 1723 bytes, 147369 codepoints in 759 ranges (U+0000AA - U+03347A) using skiplist
33
// Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist
4-
// Cased : 401 bytes, 4580 codepoints in 156 ranges (U+0000AA - U+01F18A) using skiplist
54
// Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist
65
// Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset
6+
// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist
77
// N : 463 bytes, 1914 codepoints in 145 ranges (U+0000B2 - U+01FBFA) using skiplist
88
// Uppercase : 799 bytes, 1980 codepoints in 659 ranges (U+0000C0 - U+01F18A) using bitset
99
// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading
1010
// to_lower : 11708 bytes
1111
// to_upper : 13656 bytes
12-
// Total : 31911 bytes
12+
// Total : 31543 bytes
1313

1414
use super::rt::*;
1515

@@ -253,70 +253,6 @@ pub mod case_ignorable {
253253
}
254254
}
255255

256-
pub mod cased {
257-
use super::ShortOffsetRunHeader;
258-
259-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [
260-
ShortOffsetRunHeader::new(0, 4256),
261-
ShortOffsetRunHeader::new(51, 5024),
262-
ShortOffsetRunHeader::new(61, 7296),
263-
ShortOffsetRunHeader::new(65, 7958),
264-
ShortOffsetRunHeader::new(74, 9398),
265-
ShortOffsetRunHeader::new(149, 11264),
266-
ShortOffsetRunHeader::new(151, 42560),
267-
ShortOffsetRunHeader::new(163, 43824),
268-
ShortOffsetRunHeader::new(177, 64256),
269-
ShortOffsetRunHeader::new(183, 65313),
270-
ShortOffsetRunHeader::new(187, 66560),
271-
ShortOffsetRunHeader::new(191, 67456),
272-
ShortOffsetRunHeader::new(213, 68736),
273-
ShortOffsetRunHeader::new(221, 71840),
274-
ShortOffsetRunHeader::new(229, 93760),
275-
ShortOffsetRunHeader::new(231, 119808),
276-
ShortOffsetRunHeader::new(237, 120486),
277-
ShortOffsetRunHeader::new(274, 122624),
278-
ShortOffsetRunHeader::new(297, 122928),
279-
ShortOffsetRunHeader::new(303, 125184),
280-
ShortOffsetRunHeader::new(305, 127280),
281-
ShortOffsetRunHeader::new(307, 1241482),
282-
];
283-
static OFFSETS: [u8; 313] = [
284-
170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 2, 35, 7, 2, 30, 5, 96, 1, 42, 4,
285-
2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1,
286-
1, 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6,
287-
2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1,
288-
7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4,
289-
1, 2, 4, 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46,
290-
18, 30, 132, 102, 3, 4, 1, 77, 20, 6, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6, 26,
291-
0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3, 1,
292-
42, 1, 9, 0, 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 32, 25, 2, 25, 0, 85, 1, 71, 1, 2,
293-
2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3,
294-
7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10,
295-
1, 20, 6, 6, 0, 62, 0, 68, 0, 26, 6, 26, 6, 26, 0,
296-
];
297-
298-
#[inline]
299-
pub fn lookup(c: char) -> bool {
300-
debug_assert!(!c.is_ascii());
301-
(c as u32) >= 0xaa && lookup_slow(c)
302-
}
303-
304-
#[inline(never)]
305-
fn lookup_slow(c: char) -> bool {
306-
const {
307-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
308-
let mut i = 0;
309-
while i < SHORT_OFFSET_RUNS.len() {
310-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
311-
i += 1;
312-
}
313-
}
314-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
315-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
316-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
317-
}
318-
}
319-
320256
pub mod grapheme_extend {
321257
use super::ShortOffsetRunHeader;
322258

@@ -535,6 +471,38 @@ pub mod lowercase {
535471
}
536472
}
537473

474+
pub mod lt {
475+
use super::ShortOffsetRunHeader;
476+
477+
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [
478+
ShortOffsetRunHeader::new(0, 453),
479+
ShortOffsetRunHeader::new(1, 8072),
480+
ShortOffsetRunHeader::new(9, 1122301),
481+
];
482+
static OFFSETS: [u8; 21] = [0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0];
483+
484+
#[inline]
485+
pub fn lookup(c: char) -> bool {
486+
debug_assert!(!c.is_ascii());
487+
(c as u32) >= 0x1c5 && lookup_slow(c)
488+
}
489+
490+
#[inline(never)]
491+
fn lookup_slow(c: char) -> bool {
492+
const {
493+
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
494+
let mut i = 0;
495+
while i < SHORT_OFFSET_RUNS.len() {
496+
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
497+
i += 1;
498+
}
499+
}
500+
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
501+
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
502+
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
503+
}
504+
}
505+
538506
pub mod n {
539507
use super::ShortOffsetRunHeader;
540508

library/coretests/tests/unicode/test_data.rs

Lines changed: 7 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -392,60 +392,6 @@ pub(super) static CASE_IGNORABLE: &[RangeInclusive<char>; 459] = &[
392392
'\u{e0100}'..='\u{e01ef}',
393393
];
394394

395-
#[rustfmt::skip]
396-
pub(super) static CASED: &[RangeInclusive<char>; 156] = &[
397-
'\u{aa}'..='\u{aa}', '\u{b5}'..='\u{b5}', '\u{ba}'..='\u{ba}', '\u{c0}'..='\u{d6}',
398-
'\u{d8}'..='\u{f6}', '\u{f8}'..='\u{1ba}', '\u{1bc}'..='\u{1bf}', '\u{1c4}'..='\u{293}',
399-
'\u{296}'..='\u{2b8}', '\u{2c0}'..='\u{2c1}', '\u{2e0}'..='\u{2e4}', '\u{345}'..='\u{345}',
400-
'\u{370}'..='\u{373}', '\u{376}'..='\u{377}', '\u{37a}'..='\u{37d}', '\u{37f}'..='\u{37f}',
401-
'\u{386}'..='\u{386}', '\u{388}'..='\u{38a}', '\u{38c}'..='\u{38c}', '\u{38e}'..='\u{3a1}',
402-
'\u{3a3}'..='\u{3f5}', '\u{3f7}'..='\u{481}', '\u{48a}'..='\u{52f}', '\u{531}'..='\u{556}',
403-
'\u{560}'..='\u{588}', '\u{10a0}'..='\u{10c5}', '\u{10c7}'..='\u{10c7}',
404-
'\u{10cd}'..='\u{10cd}', '\u{10d0}'..='\u{10fa}', '\u{10fc}'..='\u{10ff}',
405-
'\u{13a0}'..='\u{13f5}', '\u{13f8}'..='\u{13fd}', '\u{1c80}'..='\u{1c8a}',
406-
'\u{1c90}'..='\u{1cba}', '\u{1cbd}'..='\u{1cbf}', '\u{1d00}'..='\u{1dbf}',
407-
'\u{1e00}'..='\u{1f15}', '\u{1f18}'..='\u{1f1d}', '\u{1f20}'..='\u{1f45}',
408-
'\u{1f48}'..='\u{1f4d}', '\u{1f50}'..='\u{1f57}', '\u{1f59}'..='\u{1f59}',
409-
'\u{1f5b}'..='\u{1f5b}', '\u{1f5d}'..='\u{1f5d}', '\u{1f5f}'..='\u{1f7d}',
410-
'\u{1f80}'..='\u{1fb4}', '\u{1fb6}'..='\u{1fbc}', '\u{1fbe}'..='\u{1fbe}',
411-
'\u{1fc2}'..='\u{1fc4}', '\u{1fc6}'..='\u{1fcc}', '\u{1fd0}'..='\u{1fd3}',
412-
'\u{1fd6}'..='\u{1fdb}', '\u{1fe0}'..='\u{1fec}', '\u{1ff2}'..='\u{1ff4}',
413-
'\u{1ff6}'..='\u{1ffc}', '\u{2071}'..='\u{2071}', '\u{207f}'..='\u{207f}',
414-
'\u{2090}'..='\u{209c}', '\u{2102}'..='\u{2102}', '\u{2107}'..='\u{2107}',
415-
'\u{210a}'..='\u{2113}', '\u{2115}'..='\u{2115}', '\u{2119}'..='\u{211d}',
416-
'\u{2124}'..='\u{2124}', '\u{2126}'..='\u{2126}', '\u{2128}'..='\u{2128}',
417-
'\u{212a}'..='\u{212d}', '\u{212f}'..='\u{2134}', '\u{2139}'..='\u{2139}',
418-
'\u{213c}'..='\u{213f}', '\u{2145}'..='\u{2149}', '\u{214e}'..='\u{214e}',
419-
'\u{2160}'..='\u{217f}', '\u{2183}'..='\u{2184}', '\u{24b6}'..='\u{24e9}',
420-
'\u{2c00}'..='\u{2ce4}', '\u{2ceb}'..='\u{2cee}', '\u{2cf2}'..='\u{2cf3}',
421-
'\u{2d00}'..='\u{2d25}', '\u{2d27}'..='\u{2d27}', '\u{2d2d}'..='\u{2d2d}',
422-
'\u{a640}'..='\u{a66d}', '\u{a680}'..='\u{a69d}', '\u{a722}'..='\u{a787}',
423-
'\u{a78b}'..='\u{a78e}', '\u{a790}'..='\u{a7dc}', '\u{a7f1}'..='\u{a7f6}',
424-
'\u{a7f8}'..='\u{a7fa}', '\u{ab30}'..='\u{ab5a}', '\u{ab5c}'..='\u{ab69}',
425-
'\u{ab70}'..='\u{abbf}', '\u{fb00}'..='\u{fb06}', '\u{fb13}'..='\u{fb17}',
426-
'\u{ff21}'..='\u{ff3a}', '\u{ff41}'..='\u{ff5a}', '\u{10400}'..='\u{1044f}',
427-
'\u{104b0}'..='\u{104d3}', '\u{104d8}'..='\u{104fb}', '\u{10570}'..='\u{1057a}',
428-
'\u{1057c}'..='\u{1058a}', '\u{1058c}'..='\u{10592}', '\u{10594}'..='\u{10595}',
429-
'\u{10597}'..='\u{105a1}', '\u{105a3}'..='\u{105b1}', '\u{105b3}'..='\u{105b9}',
430-
'\u{105bb}'..='\u{105bc}', '\u{10780}'..='\u{10780}', '\u{10783}'..='\u{10785}',
431-
'\u{10787}'..='\u{107b0}', '\u{107b2}'..='\u{107ba}', '\u{10c80}'..='\u{10cb2}',
432-
'\u{10cc0}'..='\u{10cf2}', '\u{10d50}'..='\u{10d65}', '\u{10d70}'..='\u{10d85}',
433-
'\u{118a0}'..='\u{118df}', '\u{16e40}'..='\u{16e7f}', '\u{16ea0}'..='\u{16eb8}',
434-
'\u{16ebb}'..='\u{16ed3}', '\u{1d400}'..='\u{1d454}', '\u{1d456}'..='\u{1d49c}',
435-
'\u{1d49e}'..='\u{1d49f}', '\u{1d4a2}'..='\u{1d4a2}', '\u{1d4a5}'..='\u{1d4a6}',
436-
'\u{1d4a9}'..='\u{1d4ac}', '\u{1d4ae}'..='\u{1d4b9}', '\u{1d4bb}'..='\u{1d4bb}',
437-
'\u{1d4bd}'..='\u{1d4c3}', '\u{1d4c5}'..='\u{1d505}', '\u{1d507}'..='\u{1d50a}',
438-
'\u{1d50d}'..='\u{1d514}', '\u{1d516}'..='\u{1d51c}', '\u{1d51e}'..='\u{1d539}',
439-
'\u{1d53b}'..='\u{1d53e}', '\u{1d540}'..='\u{1d544}', '\u{1d546}'..='\u{1d546}',
440-
'\u{1d54a}'..='\u{1d550}', '\u{1d552}'..='\u{1d6a5}', '\u{1d6a8}'..='\u{1d6c0}',
441-
'\u{1d6c2}'..='\u{1d6da}', '\u{1d6dc}'..='\u{1d6fa}', '\u{1d6fc}'..='\u{1d714}',
442-
'\u{1d716}'..='\u{1d734}', '\u{1d736}'..='\u{1d74e}', '\u{1d750}'..='\u{1d76e}',
443-
'\u{1d770}'..='\u{1d788}', '\u{1d78a}'..='\u{1d7a8}', '\u{1d7aa}'..='\u{1d7c2}',
444-
'\u{1d7c4}'..='\u{1d7cb}', '\u{1df00}'..='\u{1df09}', '\u{1df0b}'..='\u{1df1e}',
445-
'\u{1df25}'..='\u{1df2a}', '\u{1e030}'..='\u{1e06d}', '\u{1e900}'..='\u{1e943}',
446-
'\u{1f130}'..='\u{1f149}', '\u{1f150}'..='\u{1f169}', '\u{1f170}'..='\u{1f189}',
447-
];
448-
449395
#[rustfmt::skip]
450396
pub(super) static GRAPHEME_EXTEND: &[RangeInclusive<char>; 383] = &[
451397
'\u{300}'..='\u{36f}', '\u{483}'..='\u{489}', '\u{591}'..='\u{5bd}', '\u{5bf}'..='\u{5bf}',
@@ -776,6 +722,13 @@ pub(super) static LOWERCASE: &[RangeInclusive<char>; 676] = &[
776722
'\u{1e030}'..='\u{1e06d}', '\u{1e922}'..='\u{1e943}',
777723
];
778724

725+
#[rustfmt::skip]
726+
pub(super) static LT: &[RangeInclusive<char>; 10] = &[
727+
'\u{1c5}'..='\u{1c5}', '\u{1c8}'..='\u{1c8}', '\u{1cb}'..='\u{1cb}', '\u{1f2}'..='\u{1f2}',
728+
'\u{1f88}'..='\u{1f8f}', '\u{1f98}'..='\u{1f9f}', '\u{1fa8}'..='\u{1faf}',
729+
'\u{1fbc}'..='\u{1fbc}', '\u{1fcc}'..='\u{1fcc}', '\u{1ffc}'..='\u{1ffc}',
730+
];
731+
779732
#[rustfmt::skip]
780733
pub(super) static N: &[RangeInclusive<char>; 145] = &[
781734
'\u{b2}'..='\u{b3}', '\u{b9}'..='\u{b9}', '\u{bc}'..='\u{be}', '\u{660}'..='\u{669}',

src/tools/unicode-table-generator/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ static PROPERTIES: &[&str] = &[
9090
"Alphabetic",
9191
"Lowercase",
9292
"Uppercase",
93-
"Cased",
93+
"Lt",
9494
"Case_Ignorable",
9595
"Grapheme_Extend",
9696
"White_Space",

0 commit comments

Comments
 (0)