Skip to content

Commit 18566a7

Browse files
committed
Messy, but effective subset-alternation for Utf8/ASCII text
1 parent a53034e commit 18566a7

File tree

6 files changed

+161
-9
lines changed

6 files changed

+161
-9
lines changed

src/bin/doodle/format/text.rs

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -132,15 +132,17 @@ pub fn main(module: &mut FormatModule, base: &BaseModule) -> FormatRef {
132132
),
133133
);
134134

135-
let ascii_str = module.define_format("text.string.ascii", repeat1(base.ascii_char_strict()));
136-
let utf8_str = module.define_format("text.string.utf8", repeat(utf8_char.call()));
135+
let ascii_char = module.define_format(
136+
"text.char.ascii",
137+
Format::Map(
138+
Box::new(base.ascii_char_strict()),
139+
Expr::Lambda("byte".into(), Box::new(Expr::AsChar(Box::new(var("byte"))))),
140+
),
141+
);
137142

138143
module.define_format(
139144
"text.string",
140-
Format::NondetUnion(vec![
141-
("ascii".into(), ascii_str.call()),
142-
("utf8".into(), utf8_str.call()),
143-
]),
145+
Format::RepeatFallback(Box::new(ascii_char.call()), Box::new(utf8_char.call())),
144146
)
145147
}
146148

src/lib.rs

Lines changed: 97 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,7 @@ pub enum Value {
133133
Variant(Cow<'static, str>, Box<Value>),
134134
Seq(Vec<Value>),
135135
Mapped(Box<Value>, Box<Value>),
136+
Fallback(bool, Box<Value>),
136137
Format(Box<Format>),
137138
}
138139

@@ -466,6 +467,8 @@ pub enum Format {
466467
RepeatUntilLast(Expr, Box<Format>),
467468
/// Repeat a format until a condition is satisfied by the sequence
468469
RepeatUntilSeq(Expr, Box<Format>),
470+
/// Repeat an eager narrow format, but continue with a broader item if necessary upon recoverable failure
471+
RepeatFallback(Box<Format>, Box<Format>),
469472
/// Parse a format without advancing the stream position afterwards
470473
Peek(Box<Format>),
471474
/// Attempt to parse a format and fail if it succeeds
@@ -660,6 +663,11 @@ impl FormatModule {
660663
let t = self.infer_format_type(scope, a)?;
661664
Ok(ValueType::Seq(Box::new(t)))
662665
}
666+
Format::RepeatFallback(narrow, wide) => {
667+
let mut t = self.infer_format_type(scope, narrow)?;
668+
t = t.unify(&self.infer_format_type(scope, wide)?)?;
669+
Ok(ValueType::Seq(Box::new(t)))
670+
}
663671
Format::Peek(a) => self.infer_format_type(scope, a),
664672
Format::PeekNot(_a) => Ok(ValueType::Tuple(vec![])),
665673
Format::Slice(_expr, a) => self.infer_format_type(scope, a),
@@ -771,6 +779,7 @@ enum Decoder {
771779
Record(Vec<(Cow<'static, str>, Decoder)>),
772780
While(MatchTree, Box<Decoder>),
773781
Until(MatchTree, Box<Decoder>),
782+
RepeatFallback(MatchTree, Box<Decoder>, Box<Decoder>),
774783
RepeatCount(Expr, Box<Decoder>),
775784
RepeatUntilLast(Expr, Box<Decoder>),
776785
RepeatUntilSeq(Expr, Box<Decoder>),
@@ -1310,6 +1319,7 @@ impl Format {
13101319
Format::RepeatCount(expr, f) => f.match_bounds(module) * expr.bounds(),
13111320
Format::RepeatUntilLast(_, f) => f.match_bounds(module) * Bounds::new(1, None),
13121321
Format::RepeatUntilSeq(_, _f) => Bounds::new(0, None),
1322+
Format::RepeatFallback(_narrow, _wide) => Bounds::new(0, None),
13131323
Format::Peek(_) => Bounds::exact(0),
13141324
Format::PeekNot(_) => Bounds::exact(0),
13151325
Format::Slice(expr, _) => expr.bounds(),
@@ -1354,6 +1364,7 @@ impl Format {
13541364
Format::Record(fields) => fields.iter().any(|(_, f)| f.depends_on_next(module)),
13551365
Format::Repeat(_) => true,
13561366
Format::Repeat1(_) => true,
1367+
Format::RepeatFallback(_narrow, _wide) => true,
13571368
Format::RepeatCount(_, _f) => false,
13581369
Format::RepeatUntilLast(_, _f) => false,
13591370
Format::RepeatUntilSeq(_, _f) => false,
@@ -1409,6 +1420,14 @@ impl Format {
14091420
}
14101421
}
14111422

1423+
/// Returns `true` if this format is an `ItemVar` whose name, as reported by `module.get_name`,
/// contains the substring `"char"`; every other kind of format yields `false`.
pub fn is_char_format(&self, module: &FormatModule) -> bool {
1424+
match self {
1425+
// NOTE - currently only true for named formats matching `/.*char.*/`
1426+
Format::ItemVar(level, _args) => module.get_name(*level).contains("char"),
1427+
_ => false,
1428+
}
1429+
}
1430+
14121431
/// Returns `true` if values associated to this format should be handled as multi-character ASCII strings
14131432
pub fn is_ascii_string_format(&self, module: &FormatModule) -> bool {
14141433
match self {
@@ -1666,6 +1685,19 @@ impl<'a> MatchTreeStep<'a> {
16661685
Format::RepeatUntilSeq(_expr, _a) => {
16671686
Self::accept() // FIXME
16681687
}
1688+
Format::RepeatFallback(narrow, wide) => {
1689+
let tree = Self::add_next(module, next.clone());
1690+
tree.union(Self::add(
1691+
module,
1692+
narrow,
1693+
Rc::new(Next::Repeat(narrow, next.clone())),
1694+
))
1695+
.union(Self::add(
1696+
module,
1697+
wide,
1698+
Rc::new(Next::Repeat(wide, next.clone())),
1699+
))
1700+
}
16691701
Format::Peek(a) => {
16701702
let tree = Self::add_next(module, next.clone());
16711703
let peek = Self::add(module, a, Rc::new(Next::Empty));
@@ -2334,6 +2366,33 @@ impl Decoder {
23342366
Err(format!("cannot build match tree for {:?}", format))
23352367
}
23362368
}
2369+
Format::RepeatFallback(narrow, wide) => {
2370+
if narrow.is_nullable(compiler.module) || wide.is_nullable(compiler.module) {
2371+
return Err(format!(
2372+
"Cannot repeat nullable format: Repeat({narrow:?} ⊂ {wide:?})"
2373+
));
2374+
}
2375+
2376+
let dnarrow = Box::new(Decoder::compile_next(
2377+
compiler,
2378+
narrow,
2379+
Rc::new(Next::Repeat(narrow, next.clone())),
2380+
)?);
2381+
2382+
let dwide = Box::new(Decoder::compile_next(
2383+
compiler,
2384+
wide,
2385+
Rc::new(Next::Repeat(wide, next.clone())),
2386+
)?);
2387+
2388+
if let Some(tree) =
2389+
MatchTree::build(compiler.module, std::slice::from_ref(format), next)
2390+
{
2391+
Ok(Decoder::RepeatFallback(tree, dnarrow, dwide))
2392+
} else {
2393+
Err(format!("canot build match treee for {:?}", format))
2394+
}
2395+
}
23372396
Format::Repeat1(a) => {
23382397
if a.is_nullable(compiler.module) {
23392398
return Err(format!("cannot repeat nullable format: {a:?}"));
@@ -2545,6 +2604,44 @@ impl Decoder {
25452604
}
25462605
Ok((Value::Seq(v), input))
25472606
}
2607+
Decoder::RepeatFallback(_tree, subset, superset) => {
2608+
let mut accum = Vec::new();
2609+
let mut incr_input = input;
2610+
2611+
loop {
2612+
match subset.parse(program, scope, incr_input) {
2613+
Ok((b, next_input)) => {
2614+
accum.push(b);
2615+
incr_input = next_input;
2616+
}
2617+
Err(_err) => break,
2618+
}
2619+
}
2620+
2621+
// track the last offset we reached to see if we get any further
2622+
let checkpoint = incr_input.offset;
2623+
2624+
loop {
2625+
match superset.parse(program, scope, incr_input) {
2626+
Ok((b, next_input)) => {
2627+
accum.push(b);
2628+
incr_input = next_input;
2629+
}
2630+
Err(_err) => break,
2631+
}
2632+
}
2633+
2634+
if let Err(e) = (Decoder::EndOfInput).parse(program, scope, incr_input) {
2635+
return Err(e);
2636+
}
2637+
2638+
let fellthrough = incr_input.offset > checkpoint;
2639+
2640+
Ok((
2641+
Value::Fallback(fellthrough, Box::new(Value::Seq(accum))),
2642+
incr_input,
2643+
)) // index 1
2644+
}
25482645
Decoder::RepeatCount(expr, a) => {
25492646
let mut input = input;
25502647
let count = expr.eval_value(scope).unwrap_usize();

src/output/flat.rs

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -155,6 +155,10 @@ fn check_covered(
155155
| Format::RepeatUntilSeq(_, format) => {
156156
check_covered(module, path, format)?;
157157
}
158+
Format::RepeatFallback(narrow, wide) => {
159+
check_covered(module, path, narrow)?;
160+
check_covered(module, path, wide)?;
161+
}
158162
Format::Peek(_) => {} // FIXME
159163
Format::PeekNot(_) => {} // FIXME
160164
Format::Slice(_, format) => {
@@ -260,6 +264,15 @@ impl<'module, W: io::Write> Context<'module, W> {
260264
}
261265
_ => panic!("expected sequence"),
262266
},
267+
Format::RepeatFallback(_narrow, _wide) => match value {
268+
Value::Seq(values) => {
269+
for _v in values {
270+
(); // FIXME
271+
}
272+
Ok(())
273+
}
274+
_ => panic!("expected sequence"),
275+
},
263276
Format::Peek(format) => self.write_flat(scope, value, format),
264277
Format::PeekNot(format) => self.write_flat(scope, value, format),
265278
Format::Slice(_, format) => self.write_flat(scope, value, format),

src/output/tree.rs

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,13 @@ impl<'module> MonoidalPrinter<'module> {
120120
Value::Char(_) => true,
121121
Value::Bool(_) => true,
122122
Value::U8(_) | Value::U16(_) | Value::U32(_) => true,
123+
Value::Fallback(is_fallback, v) => match format {
124+
Some(Format::RepeatFallback(a, b)) => {
125+
let format = if *is_fallback { b } else { a };
126+
self.is_atomic_value(v, Some(format))
127+
}
128+
_ => self.is_atomic_value(v, None),
129+
},
123130
Value::Tuple(values) => values.is_empty(),
124131
Value::Record(fields) => fields.is_empty(),
125132
Value::Seq(values) => values.is_empty(),
@@ -250,6 +257,30 @@ impl<'module> MonoidalPrinter<'module> {
250257
}
251258
_ => panic!("expected sequence, found {value:?}"),
252259
},
260+
Format::RepeatFallback(narrow, wide) => match value {
261+
Value::Fallback(is_wide, v) => match v.deref() {
262+
Value::Seq(values) => {
263+
let format = if *is_wide { wide } else { narrow };
264+
if self.flags.tables_for_record_sequences
265+
&& self.is_record_with_atomic_fields(format).is_some()
266+
{
267+
self.compile_seq_records(values, format)
268+
} else if self.flags.pretty_ascii_strings
269+
&& format.is_ascii_char_format(self.module)
270+
{
271+
self.compile_ascii_seq(values)
272+
} else if self.flags.pretty_utf8_strings
273+
&& format.is_char_format(self.module)
274+
{
275+
self.compile_char_seq(values)
276+
} else {
277+
self.compile_seq(scope, values, Some(format))
278+
}
279+
}
280+
_ => panic!("expected sequence, found {v:?}"),
281+
},
282+
_ => panic!("expected Fallback, found {value:?}"),
283+
},
253284
Format::Peek(format) => self.compile_decoded_value(scope, value, format),
254285
Format::PeekNot(_format) => self.compile_value(scope, value),
255286
Format::Slice(_, format) => self.compile_decoded_value(scope, value, format),
@@ -315,6 +346,7 @@ impl<'module> MonoidalPrinter<'module> {
315346
Value::Seq(vals) => self.compile_seq(scope, vals, None),
316347
Value::Record(fields) => self.compile_record(scope, fields, None),
317348
Value::Variant(label, value) => self.compile_variant(scope, label, value, None),
349+
Value::Fallback(_, value) => self.compile_value(scope, value),
318350
Value::Mapped(orig, value) => {
319351
if self.flags.collapse_mapped_values {
320352
self.compile_value(scope, value)
@@ -1010,6 +1042,15 @@ impl<'module> MonoidalPrinter<'module> {
10101042
prec,
10111043
Precedence::FORMAT_COMPOUND,
10121044
),
1045+
Format::RepeatFallback(narrow, wide) => {
1046+
let wide_frag = self.compile_format(wide, Precedence::FORMAT_ATOM);
1047+
1048+
cond_paren(
1049+
self.compile_nested_format("repeat_fallback", Some(&[wide_frag]), narrow, prec),
1050+
prec,
1051+
Precedence::FORMAT_COMPOUND,
1052+
)
1053+
}
10131054
Format::Repeat(format) => cond_paren(
10141055
self.compile_nested_format("repeat", None, format, prec),
10151056
prec,

tests/expected/decode/test.txt.stdout

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
├── data <- _ |...| _ :=
2-
│ └── text <- text.string := { ascii := "GIF89a is a popular format\n" }
2+
│ └── text <- text.string := "GIF89a is a popular format\n"
33
└── end <- end-of-input
Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
11
├── data <- _ |...| _ :=
2-
│ └── text <- text.string :=
3-
│ └── utf8 <- text.string.utf8 := "この🦀は擂り身ではなく、本物のカニです。\n"
2+
│ └── text <- text.string := "この🦀は擂り身ではなく、本物のカニです。\n"
43
└── end <- end-of-input

0 commit comments

Comments
 (0)