@@ -125,13 +125,14 @@ Section: Creating a string
125125#[ stable( feature = "rust1" , since = "1.0.0" ) ]
126126pub struct Utf8Error {
127127 valid_up_to : usize ,
128+ invalid_length : Option < u8 > ,
128129}
129130
130131impl Utf8Error {
131132 /// Returns the index in the given string up to which valid UTF-8 was
132133 /// verified.
133134 ///
134- /// It is the maximum index such that `from_utf8(input[..index])`
135+ /// It is the maximum index such that `from_utf8(& input[..index])`
135136 /// would return `Ok(_)`.
136137 ///
137138 /// # Examples
@@ -152,6 +153,21 @@ impl Utf8Error {
152153 /// ```
153154 #[ stable( feature = "utf8_error" , since = "1.5.0" ) ]
154155 pub fn valid_up_to ( & self ) -> usize { self . valid_up_to }
156+
157+ /// Provide more information about the failure:
158+ ///
159+ /// * `None`: the end of the input was reached unexpectedly.
160+ /// `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
161+ /// If a byte stream (such as a file or a network socket) is being decoded incrementally,
162+ /// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks.
163+ ///
164+ /// * `Some(index)`: an unexpected byte was encountered.
165+ /// The index provided is where decoding should resume
166+ /// (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding.
167+ #[ unstable( feature = "utf8_error_resume_from" , reason ="new" , issue = "0" ) ]
168+ pub fn resume_from ( & self ) -> Option < usize > {
169+ self . invalid_length . map ( |l| self . valid_up_to + l as usize )
170+ }
155171}
156172
157173/// Converts a slice of bytes to a string slice.
@@ -300,7 +316,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
300316#[ stable( feature = "rust1" , since = "1.0.0" ) ]
301317impl fmt:: Display for Utf8Error {
302318 fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
303- write ! ( f, "invalid utf-8: invalid byte near index {}" , self . valid_up_to)
319+ if let Some ( invalid_length) = self . invalid_length {
320+ write ! ( f, "invalid utf-8 sequence of {} bytes from index {}" ,
321+ invalid_length, self . valid_up_to)
322+ } else {
323+ write ! ( f, "incomplete utf-8 byte sequence from index {}" , self . valid_up_to)
324+ }
304325 }
305326}
306327
@@ -1241,25 +1262,27 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
12411262
12421263 while index < len {
12431264 let old_offset = index;
1244- macro_rules! err { ( ) => { {
1245- return Err ( Utf8Error {
1246- valid_up_to: old_offset
1247- } )
1248- } } }
1265+ macro_rules! err {
1266+ ( $invalid_length: expr) => {
1267+ return Err ( Utf8Error {
1268+ valid_up_to: old_offset,
1269+ invalid_length: $invalid_length,
1270+ } )
1271+ }
1272+ }
12491273
12501274 macro_rules! next { ( ) => { {
12511275 index += 1 ;
12521276 // we needed data, but there was none: error!
12531277 if index >= len {
1254- err!( )
1278+ err!( None )
12551279 }
12561280 v[ index]
12571281 } } }
12581282
12591283 let first = v[ index] ;
12601284 if first >= 128 {
12611285 let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1262- let second = next ! ( ) ;
12631286 // 2-byte encoding is for codepoints \u{0080} to \u{07ff}
12641287 // first C2 80 last DF BF
12651288 // 3-byte encoding is for codepoints \u{0800} to \u{ffff}
@@ -1279,25 +1302,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
12791302 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
12801303 // %xF4 %x80-8F 2( UTF8-tail )
12811304 match w {
1282- 2 => if second & !CONT_MASK != TAG_CONT_U8 { err ! ( ) } ,
1305+ 2 => if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1306+ err ! ( Some ( 1 ) )
1307+ } ,
12831308 3 => {
1284- match ( first, second, next ! ( ) & !CONT_MASK ) {
1285- ( 0xE0 , 0xA0 ... 0xBF , TAG_CONT_U8 ) |
1286- ( 0xE1 ... 0xEC , 0x80 ... 0xBF , TAG_CONT_U8 ) |
1287- ( 0xED , 0x80 ... 0x9F , TAG_CONT_U8 ) |
1288- ( 0xEE ... 0xEF , 0x80 ... 0xBF , TAG_CONT_U8 ) => { }
1289- _ => err ! ( )
1309+ match ( first, next ! ( ) ) {
1310+ ( 0xE0 , 0xA0 ... 0xBF ) |
1311+ ( 0xE1 ... 0xEC , 0x80 ... 0xBF ) |
1312+ ( 0xED , 0x80 ... 0x9F ) |
1313+ ( 0xEE ... 0xEF , 0x80 ... 0xBF ) => { }
1314+ _ => err ! ( Some ( 1 ) )
1315+ }
1316+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1317+ err ! ( Some ( 2 ) )
12901318 }
12911319 }
12921320 4 => {
1293- match ( first, second, next ! ( ) & !CONT_MASK , next ! ( ) & !CONT_MASK ) {
1294- ( 0xF0 , 0x90 ... 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
1295- ( 0xF1 ... 0xF3 , 0x80 ... 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
1296- ( 0xF4 , 0x80 ... 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => { }
1297- _ => err ! ( )
1321+ match ( first, next ! ( ) ) {
1322+ ( 0xF0 , 0x90 ... 0xBF ) |
1323+ ( 0xF1 ... 0xF3 , 0x80 ... 0xBF ) |
1324+ ( 0xF4 , 0x80 ... 0x8F ) => { }
1325+ _ => err ! ( Some ( 1 ) )
1326+ }
1327+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1328+ err ! ( Some ( 2 ) )
1329+ }
1330+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1331+ err ! ( Some ( 3 ) )
12981332 }
12991333 }
1300- _ => err ! ( )
1334+ _ => err ! ( Some ( 1 ) )
13011335 }
13021336 index += 1 ;
13031337 } else {
0 commit comments