Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
29b58d0
feat(std.zon): add escape_unicode options to zon.serializer
nurulhudaapon Apr 17, 2025
066ea42
wip: make escape_unicode = false by default
nurulhudaapon May 9, 2025
7f6586b
fix: escape ",\r,\n,\t,\
nurulhudaapon May 13, 2025
b9783ef
fix: make emit_codepoint_literals = .always emit double quoted hex co…
nurulhudaapon May 21, 2025
a277bc8
Merge branch 'master' into zon/serializer-unicode-escaping
nurulhudaapon May 21, 2025
3cc48cc
Merge branch 'master' into zon/serializer-unicode-escaping
nurulhudaapon Jun 2, 2025
e8dacd5
Revert "fix: make emit_codepoint_literals = .always emit double quote…
nurulhudaapon Jun 4, 2025
758be84
Merge branch 'master' into zon/serializer-unicode-escaping
nurulhudaapon Jun 4, 2025
6de0089
fix: always escape some char, move escaping logic and re-use in codep…
nurulhudaapon Jun 6, 2025
f16c6bb
Merge branch 'master' into zon/serializer-unicode-escaping
nurulhudaapon Jun 6, 2025
a3b9f3e
Merge branch 'master' into zon/serializer-unicode-escaping
nurulhudaapon Jun 6, 2025
0144c53
Merge branch 'master' into zon/serializer-unicode-escaping
nurulhudaapon Jun 12, 2025
7761c02
Merge branch 'master' into zon/serializer-unicode-escaping
nurulhudaapon Jun 16, 2025
7cb67ed
fix: code cleanup and single/double quote handling
nurulhudaapon Oct 22, 2025
344237a
fix: proper double/single quote handling and code cleanup
nurulhudaapon Oct 22, 2025
bd3e061
Merge branch 'master' into zon/serializer-unicode-escaping
nurulhudaapon Oct 22, 2025
b12fbd0
refactor: rename codepoint to val for consistency
nurulhudaapon Oct 22, 2025
a57a46d
Merge branch 'master' into zon/serializer-unicode-escaping
nurulhudaapon Oct 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 108 additions & 6 deletions lib/std/zon/Serializer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ pub const ValueOptions = struct {
emit_codepoint_literals: EmitCodepointLiterals = .never,
emit_strings_as_containers: bool = false,
emit_default_optional_fields: bool = true,
escape_non_ascii: bool = false,
};

/// Determines when to emit Unicode code point literals as opposed to integer literals.
Expand Down Expand Up @@ -125,7 +126,7 @@ pub fn valueArbitraryDepth(self: *Serializer, val: anytype, options: ValueOption
comptime assert(canSerializeType(@TypeOf(val)));
switch (@typeInfo(@TypeOf(val))) {
.int, .comptime_int => if (options.emit_codepoint_literals.emitAsCodepoint(val)) |c| {
self.codePoint(c) catch |err| switch (err) {
self.codePoint(c, .{ .escape_non_ascii = options.escape_non_ascii }) catch |err| switch (err) {
error.InvalidCodepoint => unreachable, // Already validated
else => |e| return e,
};
Expand All @@ -146,7 +147,7 @@ pub fn valueArbitraryDepth(self: *Serializer, val: anytype, options: ValueOption
(pointer.sentinel() == null or pointer.sentinel() == 0) and
!options.emit_strings_as_containers)
{
return try self.string(val);
return try self.string(val, .{ .escape_non_ascii = options.escape_non_ascii });
}

// Serialize as either a tuple or as the child type
Expand Down Expand Up @@ -280,12 +281,21 @@ pub fn ident(self: *Serializer, name: []const u8) Error!void {
}

pub const CodePointError = Error || error{InvalidCodepoint};
/// Options for formatting code points.
///
/// Passed to `codePoint`, which forwards `escape_non_ascii` to the internal codepoint writer.
pub const CodePointOptions = struct {
/// When true, non-ASCII codepoints are emitted as escape sequences instead of
/// being written out directly. Defaults to false.
escape_non_ascii: bool = false,
};

/// Serialize `val` as a Unicode codepoint.
///
/// Returns `error.InvalidCodepoint` if `val` is not a valid Unicode codepoint.
pub fn codePoint(self: *Serializer, val: u21) CodePointError!void {
try self.writer.print("'{f}'", .{std.zig.fmtChar(val)});
pub fn codePoint(self: *Serializer, val: u21, options: CodePointOptions) CodePointError!void {
    // Reject out-of-range values before emitting anything, so that a failed call
    // does not leave a dangling opening quote in the output.
    if (val > 0x10FFFF) return error.InvalidCodepoint;

    // Emit a single-quoted character literal; the body is escaped for single-quote context.
    try self.writer.writeByte('\'');
    try self.writeCodepoint(val, .{
        .escape_non_ascii = options.escape_non_ascii,
        .quote_style = .single,
    });
    try self.writer.writeByte('\'');
}

/// Like `value`, but always serializes `val` as a tuple.
Expand Down Expand Up @@ -341,9 +351,101 @@ fn tupleImpl(self: *Serializer, val: anytype, options: ValueOptions) Error!void
}
}

/// Options for writing a Unicode codepoint.
///
/// Consumed by `writeCodepoint`; callers pick the quote style matching the literal
/// they are emitting (`codePoint` uses `.single`, `string` uses `.double`).
const WriteCodepointOptions = struct {
/// When true, non-ASCII codepoints are written as escape sequences rather than raw.
escape_non_ascii: bool = false,
/// If single quote style then single quotes are escaped, otherwise double quotes are escaped.
quote_style: enum { single, double } = .single,
};

/// Write the Unicode codepoint `val` to the writer as the *contents* of a quoted literal
/// (the surrounding quote characters are the caller's responsibility).
///
/// * Printable ASCII is written as-is; control characters, backslash, and the quote
///   character selected by `options.quote_style` are escaped.
/// * Non-ASCII codepoints are written as UTF-8 unless `options.escape_non_ascii` is set,
///   in which case they are written as escape sequences.
/// * Surrogates (U+D800..U+DFFF) are always written as `\u{...}` since they have no
///   UTF-8 encoding.
///
/// Returns `error.InvalidCodepoint` if `val` is greater than U+10FFFF.
fn writeCodepoint(self: *Serializer, val: u21, options: WriteCodepointOptions) CodePointError!void {
    switch (val) {
        // Printable ASCII
        ' ', '!', '#'...'&', '('...'[', ']'...'~' => try self.writer.writeByte(@intCast(val)),
        // Unprintable ASCII
        0x00...0x08, 0x0B, 0x0C, 0x0E...0x1F, 0x7F => try self.writer.print("\\x{x:0>2}", .{val}),
        // ASCII with special escapes
        '\n' => try self.writer.writeAll("\\n"),
        '\r' => try self.writer.writeAll("\\r"),
        '\t' => try self.writer.writeAll("\\t"),
        '\\' => try self.writer.writeAll("\\\\"),
        // Quotes need escaping if they conflict with the in-use quote character
        '\'' => if (options.quote_style == .single) try self.writer.writeAll("\\'") else try self.writer.writeByte('\''),
        '\"' => if (options.quote_style == .double) try self.writer.writeAll("\\\"") else try self.writer.writeByte('"'),
        // Surrogates can only be written with an escape
        0xD800...0xDFFF => try self.writer.print("\\u{{{x}}}", .{val}),
        // All other valid non-ASCII codepoints
        0x80...0xD7FF, 0xE000...0x10FFFF => if (options.escape_non_ascii) {
            if (val <= 0xFF and options.quote_style == .single) {
                // In a character literal `\xNN` denotes the value NN, so the short form is safe.
                try self.writer.print("\\x{x:0>2}", .{val});
            } else {
                // In a string literal `\xNN` would denote a single raw byte rather than the
                // codepoint, so `\u{...}` is required to preserve the UTF-8 encoding.
                try self.writer.print("\\u{{{x}}}", .{val});
            }
        } else {
            // Always UTF-8 encode: writing U+0080..U+00FF as a single byte would
            // produce invalid UTF-8 in the output.
            var buf: [4]u8 = undefined;
            // Cannot fail: surrogates and out-of-range values are handled by other prongs.
            const len = std.unicode.utf8Encode(val, &buf) catch unreachable;
            try self.writer.writeAll(buf[0..len]);
        },
        // Invalid codepoints
        0x110000...std.math.maxInt(u21) => return error.InvalidCodepoint,
    }
}

/// Options for serializing a string with `string`.
pub const StringOptions = struct {
/// When true, non-ASCII bytes in the string are written as escape sequences
/// rather than being copied through as-is. Defaults to false.
escape_non_ascii: bool = false,
};

/// Like `value`, but always serializes `val` as a string.
pub fn string(self: *Serializer, val: []const u8) Error!void {
try self.writer.print("\"{f}\"", .{std.zig.fmtString(val)});
pub fn string(self: *Serializer, val: []const u8, options: StringOptions) Writer.Error!void {
    // Emits `val` as a double-quoted string literal. `val` is treated as arbitrary bytes:
    // it does not have to be valid UTF-8. Writer failures are propagated to the caller.
    try self.writer.writeByte('"');
    // Batch write sequences of "raw" bytes (printable ASCII or non-escaped non-ASCII) for performance.
    // `val[start..i]` contains pending raw bytes to write.
    var start: usize = 0;
    var i: usize = 0;
    while (i < val.len) {
        const byte = val[i];
        // Check if this byte can be written as-is
        const is_raw = switch (byte) {
            ' ', '!', '#'...'[', ']'...'~' => true,
            0x80...0xFF => !options.escape_non_ascii,
            else => false,
        };
        if (is_raw) {
            i += 1;
            continue;
        }
        // Flush pending raw bytes
        try self.writer.writeAll(val[start..i]);
        if (byte >= 0x80) {
            // Only reachable when `escape_non_ascii` is set. Try to decode a complete,
            // well-formed UTF-8 sequence starting at `i`.
            const decoded: ?struct { cp: u21, len: usize } = blk: {
                const n = std.unicode.utf8ByteSequenceLength(byte) catch break :blk null;
                if (n > val.len - i) break :blk null; // truncated sequence
                const cp = std.unicode.utf8Decode(val[i..][0..n]) catch break :blk null;
                break :blk .{ .cp = cp, .len = n };
            };
            if (decoded) |d| {
                // `\u{...}` re-encodes to the same UTF-8 bytes when parsed. `\xNN` must not
                // be used here, since in a string it denotes a single raw byte.
                try self.writer.print("\\u{{{x}}}", .{d.cp});
                i += d.len;
            } else {
                // Not valid UTF-8: preserve the byte exactly with a byte escape.
                try self.writer.print("\\x{x:0>2}", .{byte});
                i += 1;
            }
        } else {
            // ASCII character that needs escaping
            self.writeCodepoint(byte, .{
                .escape_non_ascii = options.escape_non_ascii,
                .quote_style = .double,
            }) catch |err| switch (err) {
                error.InvalidCodepoint => unreachable, // ASCII is always a valid codepoint
                else => |e| return e, // Writer failures must reach the caller
            };
            i += 1;
        }
        start = i;
    }

    try self.writer.writeAll(val[start..]);
    try self.writer.writeByte('"');
}

/// Options for formatting multiline strings.
Expand Down
30 changes: 17 additions & 13 deletions lib/std/zon/stringify.zig
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
const std = @import("std");
const assert = std.debug.assert;
const Writer = std.Io.Writer;
const Serializer = std.zon.Serializer;
const Serializer = @import("Serializer.zig");

pub const SerializeOptions = struct {
/// If false, whitespace is omitted. Otherwise whitespace is emitted in standard Zig style.
Expand All @@ -37,6 +37,8 @@ pub const SerializeOptions = struct {
/// If false, struct fields are not written if they are equal to their default value. Comparison
/// is done by `std.meta.eql`.
emit_default_optional_fields: bool = true,
/// If true, non-ASCII unicode characters are escaped.
escape_non_ascii: bool = false,
};

/// Serialize the given value as ZON.
Expand All @@ -51,6 +53,7 @@ pub fn serialize(val: anytype, options: SerializeOptions, writer: *Writer) Write
.emit_codepoint_literals = options.emit_codepoint_literals,
.emit_strings_as_containers = options.emit_strings_as_containers,
.emit_default_optional_fields = options.emit_default_optional_fields,
.escape_non_ascii = options.escape_non_ascii,
});
}

Expand All @@ -72,6 +75,7 @@ pub fn serializeMaxDepth(
.emit_codepoint_literals = options.emit_codepoint_literals,
.emit_strings_as_containers = options.emit_strings_as_containers,
.emit_default_optional_fields = options.emit_default_optional_fields,
.escape_non_ascii = options.escape_non_ascii,
}, depth);
}

Expand All @@ -91,6 +95,7 @@ pub fn serializeArbitraryDepth(
.emit_codepoint_literals = options.emit_codepoint_literals,
.emit_strings_as_containers = options.emit_strings_as_containers,
.emit_default_optional_fields = options.emit_default_optional_fields,
.escape_non_ascii = options.escape_non_ascii,
});
}

Expand Down Expand Up @@ -588,7 +593,7 @@ test "std.zon stringify utf8 codepoints" {
try std.testing.expectEqualStrings("97", aw.written());
aw.clearRetainingCapacity();

try s.codePoint('a');
try s.codePoint('a', .{});
try std.testing.expectEqualStrings("'a'", aw.written());
aw.clearRetainingCapacity();

Expand All @@ -609,7 +614,7 @@ test "std.zon stringify utf8 codepoints" {
try std.testing.expectEqualStrings("10", aw.written());
aw.clearRetainingCapacity();

try s.codePoint('\n');
try s.codePoint('\n', .{});
try std.testing.expectEqualStrings("'\\n'", aw.written());
aw.clearRetainingCapacity();

Expand All @@ -630,11 +635,11 @@ test "std.zon stringify utf8 codepoints" {
try std.testing.expectEqualStrings("9889", aw.written());
aw.clearRetainingCapacity();

try s.codePoint('⚡');
try s.codePoint('⚡', .{ .escape_non_ascii = true });
try std.testing.expectEqualStrings("'\\u{26a1}'", aw.written());
aw.clearRetainingCapacity();

try s.value('⚡', .{ .emit_codepoint_literals = .always });
try s.value('⚡', .{ .emit_codepoint_literals = .always, .escape_non_ascii = true });
try std.testing.expectEqualStrings("'\\u{26a1}'", aw.written());
aw.clearRetainingCapacity();

Expand All @@ -647,8 +652,7 @@ test "std.zon stringify utf8 codepoints" {
aw.clearRetainingCapacity();

// Invalid codepoint
try s.codePoint(0x110000 + 1);
try std.testing.expectEqualStrings("'\\u{110001}'", aw.written());
try std.testing.expectError(error.InvalidCodepoint, s.codePoint(0x110000 + 1, .{ .escape_non_ascii = true }));
aw.clearRetainingCapacity();

try s.int(0x110000 + 1);
Expand Down Expand Up @@ -681,7 +685,7 @@ test "std.zon stringify utf8 codepoints" {
aw.clearRetainingCapacity();

// Make sure value options are passed to children
try s.value(.{ .c = '⚡' }, .{ .emit_codepoint_literals = .always });
try s.value(.{ .c = '⚡' }, .{ .emit_codepoint_literals = .always, .escape_non_ascii = true });
try std.testing.expectEqualStrings(".{ .c = '\\u{26a1}' }", aw.written());
aw.clearRetainingCapacity();

Expand All @@ -696,8 +700,8 @@ test "std.zon stringify strings" {
defer aw.deinit();

// Minimal case
try s.string("abc⚡\n");
try std.testing.expectEqualStrings("\"abc\\xe2\\x9a\\xa1\\n\"", aw.written());
try s.string("abc⚡\n", .{ .escape_non_ascii = true });
try std.testing.expectEqualStrings("\"abc\\u{26a1}\\n\"", aw.written());
aw.clearRetainingCapacity();

try s.tuple("abc⚡\n", .{});
Expand All @@ -714,8 +718,8 @@ test "std.zon stringify strings" {
, aw.written());
aw.clearRetainingCapacity();

try s.value("abc⚡\n", .{});
try std.testing.expectEqualStrings("\"abc\\xe2\\x9a\\xa1\\n\"", aw.written());
try s.value("abc⚡\n", .{ .escape_non_ascii = false });
try std.testing.expectEqualStrings("\"abc\\n\"", aw.written());
aw.clearRetainingCapacity();

try s.value("abc⚡\n", .{ .emit_strings_as_containers = true });
Expand Down Expand Up @@ -816,7 +820,7 @@ test "std.zon stringify multiline strings" {

{
const str: []const u8 = &.{ 'a', '\r', 'c' };
try s.string(str);
try s.string(str, .{ .escape_non_ascii = false });
try std.testing.expectEqualStrings("\"a\\rc\"", aw.written());
aw.clearRetainingCapacity();
}
Expand Down