Skip to content

Commit 6d883b5

Browse files
authored
Merge pull request #439 from dralley/encoding-module
Encoding module
2 parents c590fdf + c6fc0ba commit 6d883b5

File tree

10 files changed

+205
-176
lines changed

10 files changed

+205
-176
lines changed

Changelog.md

+4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
| |`resolve`
3838
|`event_namespace` |`resolve_element`
3939
|`attribute_namespace` |`resolve_attribute`
40+
- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
41+
under the `quick_xml::encoding` namespace.
4042

4143

4244
### Bug Fixes
@@ -209,6 +211,8 @@
209211
[#431]: https://github.com/tafia/quick-xml/pull/431
210212
[#434]: https://github.com/tafia/quick-xml/pull/434
211213
[#437]: https://github.com/tafia/quick-xml/pull/437
214+
[#439]: https://github.com/tafia/quick-xml/pull/439
215+
212216

213217
## 0.23.0 -- 2022-05-08
214218

src/de/escape.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
//! Serde `Deserializer` module
22
33
use crate::de::deserialize_bool;
4+
use crate::encoding::Decoder;
45
use crate::errors::serialize::DeError;
56
use crate::escape::unescape;
6-
use crate::reader::Decoder;
77
use serde::de::{DeserializeSeed, EnumAccess, VariantAccess, Visitor};
88
use serde::{self, forward_to_deserialize_any, serde_if_integer128};
99
use std::borrow::Cow;

src/de/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -215,10 +215,10 @@ mod var;
215215

216216
pub use crate::errors::serialize::DeError;
217217
use crate::{
218+
encoding::Decoder,
218219
errors::Error,
219220
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
220221
name::QName,
221-
reader::Decoder,
222222
Reader,
223223
};
224224
use serde::de::{self, Deserialize, DeserializeOwned, Visitor};

src/de/seq.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::de::{DeError, DeEvent, Deserializer, XmlRead};
2+
use crate::encoding::Decoder;
23
use crate::events::BytesStart;
3-
use crate::reader::Decoder;
44
use serde::de::{DeserializeSeed, SeqAccess};
55

66
/// Check if tag `start` is included in the `fields` list. `decoder` is used to

src/de/simple_type.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
//! [as defined]: https://www.w3.org/TR/xmlschema11-1/#Simple_Type_Definition
55
66
use crate::de::{deserialize_bool, str2bool};
7+
use crate::encoding::Decoder;
78
use crate::errors::serialize::DeError;
89
use crate::escape::unescape;
9-
use crate::reader::Decoder;
1010
use memchr::memchr;
1111
use serde::de::{DeserializeSeed, Deserializer, EnumAccess, SeqAccess, VariantAccess, Visitor};
1212
use serde::{self, serde_if_integer128};

src/encoding.rs

+187
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
//! A module for wrappers that encode / decode data.
2+
3+
use std::borrow::Cow;
4+
5+
#[cfg(feature = "encoding")]
6+
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
7+
8+
use crate::{Error, Result};
9+
10+
/// Decoder of byte slices into strings.
11+
///
12+
/// If feature `encoding` is enabled, the encoding is taken from the `"encoding"`
13+
/// XML declaration, or UTF-8 is assumed if the XML has no `<?xml ?>` declaration, the encoding
14+
/// key is not defined, or it contains an unknown encoding.
15+
///
16+
/// The library supports any UTF-8 compatible encodings that crate `encoding_rs`
17+
/// supports. [*UTF-16 is not supported at present*][utf16].
18+
///
19+
/// If feature `encoding` is disabled, the decoder is always a UTF-8 decoder:
20+
/// any XML declarations are ignored.
21+
///
22+
/// [utf16]: https://github.com/tafia/quick-xml/issues/158
23+
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
24+
pub struct Decoder {
25+
#[cfg(feature = "encoding")]
26+
pub(crate) encoding: &'static Encoding,
27+
}
28+
29+
impl Decoder {
30+
pub(crate) fn utf8() -> Self {
31+
Decoder {
32+
#[cfg(feature = "encoding")]
33+
encoding: UTF_8,
34+
}
35+
}
36+
37+
#[cfg(all(test, feature = "encoding", feature = "serialize"))]
38+
pub(crate) fn utf16() -> Self {
39+
Decoder { encoding: UTF_16LE }
40+
}
41+
}
42+
43+
#[cfg(not(feature = "encoding"))]
44+
impl Decoder {
45+
/// Decodes a UTF-8 slice regardless of XML declaration, without stripping the BOM if
46+
/// it is present in the `bytes`.
47+
///
48+
/// Returns an error in case of malformed sequences in the `bytes`.
49+
///
50+
/// If you instead want to use XML declared encoding, use the `encoding` feature
51+
#[inline]
52+
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
53+
Ok(Cow::Borrowed(std::str::from_utf8(bytes)?))
54+
}
55+
56+
/// Decodes a slice regardless of XML declaration with BOM removal if
57+
/// it is present in the `bytes`.
58+
///
59+
/// Returns an error in case of malformed sequences in the `bytes`.
60+
///
61+
/// If you instead want to use XML declared encoding, use the `encoding` feature
62+
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
63+
let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
64+
&bytes[3..]
65+
} else {
66+
bytes
67+
};
68+
self.decode(bytes)
69+
}
70+
}
71+
72+
#[cfg(feature = "encoding")]
73+
impl Decoder {
74+
/// Returns the `Reader`'s encoding.
75+
///
76+
/// This encoding will be used by [`decode`].
77+
///
78+
/// [`decode`]: Self::decode
79+
pub fn encoding(&self) -> &'static Encoding {
80+
self.encoding
81+
}
82+
83+
/// Decodes specified bytes using encoding, declared in the XML, if it was
84+
/// declared there, or UTF-8 otherwise, without stripping the BOM if it is present
85+
/// in the `bytes`.
86+
///
87+
/// Returns an error in case of malformed sequences in the `bytes`.
88+
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
89+
decode(bytes, self.encoding)
90+
}
91+
92+
/// Decodes a slice with BOM removal if it is present in the `bytes` using
93+
/// the reader encoding.
94+
///
95+
/// If this method is called after reading the XML declaration with the `"encoding"`
96+
/// key, then this encoding is used, otherwise UTF-8 is used.
97+
///
98+
/// If XML declaration is absent in the XML, UTF-8 is used.
99+
///
100+
/// Returns an error in case of malformed sequences in the `bytes`.
101+
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
102+
self.decode(remove_bom(bytes, self.encoding))
103+
}
104+
}
105+
106+
/// Decodes the provided bytes using the specified encoding, without stripping the BOM
107+
/// if it is present in the `bytes`.
108+
///
109+
/// Returns an error in case of malformed sequences in the `bytes`.
110+
#[cfg(feature = "encoding")]
111+
pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> {
112+
encoding
113+
.decode_without_bom_handling_and_without_replacement(bytes)
114+
.ok_or(Error::NonDecodable(None))
115+
}
116+
117+
/// Decodes a slice with an unknown encoding, removing the BOM if it is present
118+
/// in the bytes.
119+
///
120+
/// Returns an error in case of malformed sequences in the `bytes`.
121+
#[cfg(feature = "encoding")]
122+
pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
123+
if let Some(encoding) = detect_encoding(bytes) {
124+
let bytes = remove_bom(bytes, encoding);
125+
decode(bytes, encoding)
126+
} else {
127+
decode(bytes, UTF_8)
128+
}
129+
}
130+
131+
#[cfg(feature = "encoding")]
132+
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
133+
if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
134+
bytes.split_at(3)
135+
} else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) {
136+
bytes.split_at(2)
137+
} else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) {
138+
bytes.split_at(2)
139+
} else {
140+
(&[], bytes)
141+
}
142+
}
143+
144+
#[cfg(feature = "encoding")]
145+
fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
146+
let (_, bytes) = split_at_bom(bytes, encoding);
147+
bytes
148+
}
149+
150+
/// Automatic encoding detection of XML files based on the
151+
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
152+
///
153+
/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
154+
///
155+
/// Because the [`encoding_rs`] crate supports only a subset of those encodings, only
156+
/// the supported subset is detected: UTF-8, UTF-16 BE and UTF-16 LE.
157+
///
158+
/// The algorithm suggests examining up to the first 4 bytes to determine the encoding
159+
/// according to the following table:
160+
///
161+
/// | Bytes |Detected encoding
162+
/// |-------------|------------------------------------------
163+
/// |`FE FF ## ##`|UTF-16, big-endian
164+
/// |`FF FE ## ##`|UTF-16, little-endian
165+
/// |`EF BB BF` |UTF-8
166+
/// |-------------|------------------------------------------
167+
/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
168+
/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
169+
/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
170+
#[cfg(feature = "encoding")]
171+
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
172+
match bytes {
173+
// with BOM
174+
_ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
175+
_ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE),
176+
_ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8),
177+
178+
// without BOM
179+
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
180+
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
181+
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible
182+
183+
_ => None,
184+
}
185+
}
186+
187+
// TODO: add some tests for functions

src/events/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@ use std::fmt::{self, Debug, Formatter};
4343
use std::ops::Deref;
4444
use std::str::from_utf8;
4545

46+
use crate::encoding::Decoder;
4647
use crate::errors::{Error, Result};
4748
use crate::escape::{escape, partial_escape, unescape_with};
4849
use crate::name::{LocalName, QName};
49-
use crate::reader::Decoder;
5050
use crate::utils::write_cow_string;
5151
use attributes::{Attribute, Attributes};
5252

src/lib.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444

4545
#[cfg(feature = "serialize")]
4646
pub mod de;
47+
pub mod encoding;
4748
mod errors;
4849
mod escapei;
4950
pub mod escape {
@@ -62,8 +63,9 @@ pub mod utils;
6263
mod writer;
6364

6465
// reexports
66+
pub use crate::encoding::Decoder;
6567
#[cfg(feature = "serialize")]
6668
pub use crate::errors::serialize::DeError;
6769
pub use crate::errors::{Error, Result};
68-
pub use crate::reader::{Decoder, NsReader, Reader};
70+
pub use crate::reader::{NsReader, Reader};
6971
pub use crate::writer::{ElementWriter, Writer};

src/reader/buffered_reader.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ use std::fs::File;
55
use std::io::{self, BufRead, BufReader};
66
use std::path::Path;
77

8+
use memchr;
9+
810
use crate::errors::{Error, Result};
911
use crate::events::Event;
1012
use crate::name::QName;
1113
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource};
1214

13-
use memchr;
14-
1515
/// This is an implementation of [`Reader`] for reading from a [`BufRead`] as
1616
/// underlying byte stream.
1717
impl<R: BufRead> Reader<R> {

0 commit comments

Comments
 (0)