Skip to content

Commit 25e864e

Browse files
committed
Implement mixed script confusable lint.
1 parent ef24faf commit 25e864e

8 files changed

+206
-12
lines changed

Cargo.lock

+4-4
Original file line numberDiff line numberDiff line change
@@ -5405,15 +5405,15 @@ dependencies = [
54055405

54065406
[[package]]
54075407
name = "unicode-script"
5408-
version = "0.4.0"
5408+
version = "0.5.1"
54095409
source = "registry+https://github.com/rust-lang/crates.io-index"
5410-
checksum = "5b2c5c29e805da6817f5af6a627d65adb045cebf05cccd5a3493d6109454391c"
5410+
checksum = "58b33414ea8db4b7ea0343548dbdc31d27aef06beacf7044a87e564d9b0feb7d"
54115411

54125412
[[package]]
54135413
name = "unicode-security"
5414-
version = "0.0.3"
5414+
version = "0.0.5"
54155415
source = "registry+https://github.com/rust-lang/crates.io-index"
5416-
checksum = "a5f9011bbed9c13372bc8df618b55a38138445199caf3b61d432c6859c36dee0"
5416+
checksum = "5d87c28edc5b263377e448d6cdcb935c06b95413d8013ba6fae470558ccab18f"
54175417
dependencies = [
54185418
"unicode-normalization",
54195419
"unicode-script",

src/librustc_lint/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ path = "lib.rs"
1010

1111
[dependencies]
1212
log = "0.4"
13-
unicode-security = "0.0.3"
13+
unicode-security = "0.0.5"
1414
rustc_middle = { path = "../librustc_middle" }
1515
rustc_ast_pretty = { path = "../librustc_ast_pretty" }
1616
rustc_attr = { path = "../librustc_attr" }

src/librustc_lint/non_ascii_idents.rs

+125-2
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,35 @@ declare_lint! {
2424
crate_level_only
2525
}
2626

27-
declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS]);
27+
declare_lint! {
28+
pub MIXED_SCRIPT_CONFUSABLES,
29+
Warn,
30+
"detects Unicode scripts whose mixed script confusables codepoints are solely used",
31+
crate_level_only
32+
}
33+
34+
declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
2835

2936
impl EarlyLintPass for NonAsciiIdents {
3037
fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
3138
use rustc_session::lint::Level;
3239
use rustc_span::Span;
40+
use std::collections::BTreeMap;
3341
use unicode_security::GeneralSecurityProfile;
3442
use utils::CowBoxSymStr;
3543

3644
let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
3745
let check_uncommon_codepoints =
3846
cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
3947
let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
48+
let check_mixed_script_confusables =
49+
cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
4050

41-
if !check_non_ascii_idents && !check_uncommon_codepoints && !check_confusable_idents {
51+
if !check_non_ascii_idents
52+
&& !check_uncommon_codepoints
53+
&& !check_confusable_idents
54+
&& !check_mixed_script_confusables
55+
{
4256
return;
4357
}
4458

@@ -107,6 +121,115 @@ impl EarlyLintPass for NonAsciiIdents {
107121
.or_insert((symbol_str, sp, is_ascii));
108122
}
109123
}
124+
125+
if has_non_ascii_idents && check_mixed_script_confusables {
126+
use unicode_security::is_potential_mixed_script_confusable_char;
127+
use unicode_security::mixed_script::AugmentedScriptSet;
128+
129+
#[derive(Clone)]
130+
enum ScriptSetUsage {
131+
Suspicious(Vec<char>, Span),
132+
Verified,
133+
}
134+
135+
let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
136+
FxHashMap::default();
137+
let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
138+
script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
139+
140+
let mut has_suspicous = false;
141+
for (symbol, &sp) in symbols.iter() {
142+
let symbol_str = symbol.as_str();
143+
for ch in symbol_str.chars() {
144+
if ch.is_ascii() {
145+
// all ascii characters are covered by exception.
146+
continue;
147+
}
148+
if !GeneralSecurityProfile::identifier_allowed(ch) {
149+
// this character is covered by `uncommon_codepoints` lint.
150+
continue;
151+
}
152+
let augmented_script_set = AugmentedScriptSet::for_char(ch);
153+
script_states
154+
.entry(augmented_script_set)
155+
.and_modify(|existing_state| {
156+
if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
157+
if is_potential_mixed_script_confusable_char(ch) {
158+
ch_list.push(ch);
159+
} else {
160+
*existing_state = ScriptSetUsage::Verified;
161+
}
162+
}
163+
})
164+
.or_insert_with(|| {
165+
if !is_potential_mixed_script_confusable_char(ch) {
166+
ScriptSetUsage::Verified
167+
} else {
168+
has_suspicous = true;
169+
ScriptSetUsage::Suspicious(vec![ch], sp)
170+
}
171+
});
172+
}
173+
}
174+
175+
if has_suspicous {
176+
let verified_augmented_script_sets = script_states
177+
.iter()
178+
.flat_map(|(k, v)| match v {
179+
ScriptSetUsage::Verified => Some(*k),
180+
_ => None,
181+
})
182+
.collect::<Vec<_>>();
183+
184+
// we're sorting the output here.
185+
let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
186+
BTreeMap::new();
187+
188+
'outerloop: for (augment_script_set, usage) in script_states {
189+
let (mut ch_list, sp) = match usage {
190+
ScriptSetUsage::Verified => continue,
191+
ScriptSetUsage::Suspicious(ch_list, sp) => (ch_list, sp),
192+
};
193+
194+
if augment_script_set.is_all() {
195+
continue;
196+
}
197+
198+
for existing in verified_augmented_script_sets.iter() {
199+
if existing.is_all() {
200+
continue;
201+
}
202+
let mut intersect = *existing;
203+
intersect.intersect_with(augment_script_set);
204+
if !intersect.is_empty() && !intersect.is_all() {
205+
continue 'outerloop;
206+
}
207+
}
208+
209+
ch_list.sort();
210+
ch_list.dedup();
211+
lint_reports.insert((sp, ch_list), augment_script_set);
212+
}
213+
214+
for ((sp, ch_list), script_set) in lint_reports {
215+
cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| {
216+
let message = format!(
217+
"The usage of Script Group `{}` in this crate consists solely of mixed script confusables",
218+
script_set);
219+
let mut note = "The usage includes ".to_string();
220+
for (idx, ch) in ch_list.into_iter().enumerate() {
221+
if idx != 0 {
222+
note += ", ";
223+
}
224+
let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
225+
note += &char_info;
226+
}
227+
note += ".";
228+
lint.build(&message).note(&note).note("Please recheck to make sure their usages are indeed what you want.").emit()
229+
});
230+
}
231+
}
232+
}
110233
}
111234
}
112235

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// check-pass
2+
#![feature(non_ascii_idents)]
3+
#![deny(mixed_script_confusables)]
4+
5+
struct ΑctuallyNotLatin;
6+
7+
fn main() {
8+
let λ = 42; // this usage of Greek confirms that Greek is used intentionally.
9+
}
10+
11+
mod роре {
12+
const エ: &'static str = "アイウ";
13+
14+
// this usage of Katakana confirms that Katakana is used intentionally.
15+
fn ニャン() {
16+
let д: usize = 100; // this usage of Cyrillic confirms that Cyrillic is used intentionally.
17+
18+
println!("meow!");
19+
}
20+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#![feature(non_ascii_idents)]
2+
#![deny(mixed_script_confusables)]
3+
4+
struct ΑctuallyNotLatin;
5+
//~^ ERROR The usage of Script Group `Greek` in this crate consists solely of
6+
7+
fn main() {
8+
let v = ΑctuallyNotLatin;
9+
}
10+
11+
mod роре {
12+
//~^ ERROR The usage of Script Group `Cyrillic` in this crate consists solely of
13+
const エ: &'static str = "アイウ";
14+
//~^ ERROR The usage of Script Group `Japanese, Katakana` in this crate consists solely of
15+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
error: The usage of Script Group `Greek` in this crate consists solely of mixed script confusables
2+
--> $DIR/lint-mixed-script-confusables.rs:4:8
3+
|
4+
LL | struct ΑctuallyNotLatin;
5+
| ^^^^^^^^^^^^^^^^
6+
|
7+
note: the lint level is defined here
8+
--> $DIR/lint-mixed-script-confusables.rs:2:9
9+
|
10+
LL | #![deny(mixed_script_confusables)]
11+
| ^^^^^^^^^^^^^^^^^^^^^^^^
12+
= note: The usage includes 'Α' (U+0391).
13+
= note: Please recheck to make sure their usages are indeed what you want.
14+
15+
error: The usage of Script Group `Cyrillic` in this crate consists solely of mixed script confusables
16+
--> $DIR/lint-mixed-script-confusables.rs:11:5
17+
|
18+
LL | mod роре {
19+
| ^^^^
20+
|
21+
= note: The usage includes 'е' (U+0435), 'о' (U+043E), 'р' (U+0440).
22+
= note: Please recheck to make sure their usages are indeed what you want.
23+
24+
error: The usage of Script Group `Japanese, Katakana` in this crate consists solely of mixed script confusables
25+
--> $DIR/lint-mixed-script-confusables.rs:13:11
26+
|
27+
LL | const エ: &'static str = "アイウ";
28+
| ^^
29+
|
30+
= note: The usage includes 'エ' (U+30A8).
31+
= note: Please recheck to make sure their usages are indeed what you want.
32+
33+
error: aborting due to 3 previous errors
34+

src/test/ui/utf8_idents.rs

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#![allow(mixed_script_confusables)]
2+
13
fn foo<
24
'β, //~ ERROR non-ascii idents are not fully supported
35
γ //~ ERROR non-ascii idents are not fully supported

src/test/ui/utf8_idents.stderr

+5-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
error[E0658]: non-ascii idents are not fully supported
2-
--> $DIR/utf8_idents.rs:2:5
2+
--> $DIR/utf8_idents.rs:4:5
33
|
44
LL | 'β,
55
| ^^
@@ -8,7 +8,7 @@ LL | 'β,
88
= help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable
99

1010
error[E0658]: non-ascii idents are not fully supported
11-
--> $DIR/utf8_idents.rs:3:5
11+
--> $DIR/utf8_idents.rs:5:5
1212
|
1313
LL | γ
1414
| ^
@@ -17,7 +17,7 @@ LL | γ
1717
= help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable
1818

1919
error[E0658]: non-ascii idents are not fully supported
20-
--> $DIR/utf8_idents.rs:8:5
20+
--> $DIR/utf8_idents.rs:10:5
2121
|
2222
LL | δ: usize
2323
| ^
@@ -26,7 +26,7 @@ LL | δ: usize
2626
= help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable
2727

2828
error[E0658]: non-ascii idents are not fully supported
29-
--> $DIR/utf8_idents.rs:12:9
29+
--> $DIR/utf8_idents.rs:14:9
3030
|
3131
LL | let α = 0.00001f64;
3232
| ^
@@ -35,7 +35,7 @@ LL | let α = 0.00001f64;
3535
= help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable
3636

3737
warning: type parameter `γ` should have an upper camel case name
38-
--> $DIR/utf8_idents.rs:3:5
38+
--> $DIR/utf8_idents.rs:5:5
3939
|
4040
LL | γ
4141
| ^ help: convert the identifier to upper camel case: `Γ`

0 commit comments

Comments
 (0)