ponylang · rowland66 · Jul 30, 2020 · Dec 1, 2020 · Dec 12, 2020 · Dec 12, 2020
diff --git a/packages/buffered/_test.pony b/packages/buffered/_test.pony
@@ -161,12 +161,14 @@ class iso _TestReader is UnitTest
     h.assert_eq[U128](b.u128_be()?, 0xDEADBEEFFEEDFACEDEADBEEFFEEDFACE)
     h.assert_eq[U128](b.u128_le()?, 0xDEADBEEFFEEDFACEDEADBEEFFEEDFACE)
 
-    h.assert_eq[String](b.line()?, "hi")
+    (var line: String val, _) = b.line()?
+    h.assert_eq[String]("hi", line)
     try
       b.read_until(0)?
       h.fail("should fail reading until 0")
     end
-    h.assert_eq[String](b.line()?, "there")
+    (line, _) = b.line()?
+    h.assert_eq[String]("there", line)
 
     b.append(['h'; 'i'])
 
@@ -179,7 +181,8 @@ class iso _TestReader is UnitTest
     h.assert_eq[U8](b.u8()?, 'i')
 
     b.append(['!'; '\n'])
-    h.assert_eq[String](b.line()?, "!")
+    (line, _) = b.line()?
+    h.assert_eq[String](line, "!")
 
     b.append(['s'; 't'; 'r'; '1'])
     try
@@ -221,9 +224,10 @@ class iso _TestWriter is UnitTest
       .> u128_le(0xDEADBEEFFEEDFACEDEADBEEFFEEDFACE)
 
     wb.write(['h'; 'i'])
-    wb.writev([
+    let chars: Array[ByteSeq] val = [
       ['\n'; 't'; 'h'; 'e']
-      ['r'; 'e'; '\r'; '\n']])
+      ['r'; 'e'; '\r'; '\n']]
+    wb.writev(chars)
 
     for bs in wb.done().values() do
       b.append(bs)
@@ -254,8 +258,10 @@ class iso _TestWriter is UnitTest
     h.assert_eq[U128](b.u128_be()?, 0xDEADBEEFFEEDFACEDEADBEEFFEEDFACE)
     h.assert_eq[U128](b.u128_le()?, 0xDEADBEEFFEEDFACEDEADBEEFFEEDFACE)
 
-    h.assert_eq[String](b.line()?, "hi")
-    h.assert_eq[String](b.line()?, "there")
+    (var line: String val, _) = b.line()?
+    h.assert_eq[String](line, "hi")
+    (line, _) = b.line()?
+    h.assert_eq[String](line, "there")
 
     b.append(['h'; 'i'])
 
@@ -265,4 +271,6 @@ class iso _TestWriter is UnitTest
     end
 
     b.append(['!'; '\n'])
-    h.assert_eq[String](b.line()?, "hi!")
+
+    (line, _) = b.line()?
+    h.assert_eq[String](line, "hi!")
diff --git a/packages/buffered/reader.pony b/packages/buffered/reader.pony
@@ -81,14 +81,8 @@ class Reader
     """
     Add a chunk of data.
     """
-    let data_array =
-      match data
-      | let data': Array[U8] val => data'
-      | let data': String => data'.array()
-      end
-
-    _available = _available + data_array.size()
-    _chunks.push((data_array, 0))
+    _available = _available + data.size()
+    _chunks.push((data, 0))
 
   fun ref skip(n: USize) ? =>
     """
@@ -167,16 +161,64 @@ class Reader
     u8()?
     b
 
-  fun ref line(keep_line_breaks: Bool = false): String iso^ ? =>
+  fun ref codepoint[D: StringDecoder = UTF8StringDecoder](): (U32, U8) ? =>
+    """
+    Return a pair containing a unicode codepoint, and the number of bytes consumed to produce
+    the codepoint. Depending on how bytes are decoded into characters, the number of bytes consumed
+    may be greater than one. If the bytes cannot be converted to a codepoint, codepoint 0xFFFD
+    is returned, and 1 byte is consumed.
+
+    Up to 4 bytes are peeked and handed to the decoder `D`; only the number of bytes that the
+    decoder reports as used is removed from the buffer. Raises an error if no byte can be
+    peeked, or if the reported number of bytes cannot be skipped.
+    """
+    let decoder_bytes = StringDecoderBytes.create()
+
+    // Load as many bytes as can be peeked, stopping at 4. Nothing is consumed yet;
+    // a failed peek is treated as the end of the data that is currently available.
+    while decoder_bytes.bytes_loaded() < 4 do
+      try
+        decoder_bytes.pushByte(peek_u8(decoder_bytes.bytes_loaded().usize())?)
+      else
+        break
+      end
+    end
+
+    if decoder_bytes.bytes_loaded() == 0 then error end
+
+    (let c, let sz) = D.decode(decoder_bytes.decode_bytes())
+    // Drop just the decoded bytes; `skip` avoids building an array only to discard it.
+    skip(sz.usize())?
+    (c, sz)
+
+  fun ref string[D: StringDecoder = UTF8StringDecoder](len: USize): (String iso^, USize) ? =>
+    """
+    Decode `len` characters from the buffer using decoder `D`. The result is a pair holding the
+    decoded string and the total number of bytes that were consumed to build it. The byte count
+    can exceed `len` when characters decode from more than one byte, and undecodable input may
+    show up in the string as 0xFFFD codepoints.
+    """
+    let decoded: String iso = recover String(len) end
+    var consumed: USize = 0
+    var remaining: USize = len
+    while remaining > 0 do
+      (let cp, let width) = codepoint[D]()?
+      decoded.push(cp)
+      consumed = consumed + width.usize()
+      remaining = remaining - 1
+    end
+    (consume decoded, consumed)
+
+  fun ref line[D: StringDecoder = UTF8StringDecoder](keep_line_breaks: Bool = false): (String iso^, USize) ? =>
+    """
+    Return a pair containing a \n or \r\n terminated line as a string, and the number
+    of bytes consumed to produce the string.  By default the newline is not
     included in the returned string, but it is removed from the buffer.
     Set `keep_line_breaks` to `true` to keep the line breaks in the returned line.
     """
     let len = _search_length()?
 
     _available = _available - len
-    var out = recover String(len) end
+    var outb = recover Array[U8](len) end
     var i = USize(0)
 
     while i < len do
@@ -187,7 +229,7 @@ class Reader
       let need = len - i
       let copy_len = need.min(avail)
 
-      out.append(data, offset, copy_len)
+      outb.append(data, offset, copy_len)
 
       if avail > need then
         node()? = (data, offset + need)
@@ -201,14 +243,16 @@ class Reader
     let trunc_len: USize =
       if keep_line_breaks then
         0
-      elseif (len >= 2) and (out.at_offset(-2)? == '\r') then
+      elseif (len >= 2) and (outb.apply(outb.size()-2)? == '\r') then
         2
       else
         1
       end
-    out.truncate(len - trunc_len)
+    outb.truncate(len - trunc_len)
+
+    var out = recover String.from_iso_array[D](consume outb) end
 
-    consume out
+    (consume out, len)
 
   fun ref u8(): U8 ? =>
     """
@@ -758,6 +802,7 @@ class Reader
 
     error
 
+  // TODO: Only a single byte is searched for; extend this to handle multi-byte sequences
   fun ref _distance_of(byte: U8): USize ? =>
     """
     Get the distance to the first occurrence of the given byte

diff --git a/packages/buffered/writer.pony b/packages/buffered/writer.pony
@@ -251,35 +251,51 @@ class Writer
     """
     u128_be(data.u128())
 
-  fun ref write(data: ByteSeq) =>
+  fun ref write[E: StringEncoder val = UTF8StringEncoder](data: (String | ByteSeq)) =>
     """
-    Write a ByteSeq to the buffer.
+    Write a String or a ByteSeq to the buffer. A String is converted to bytes with the encoder
+    `E` (UTF-8 by default) before it is stored, and those bytes are what the size accounts for.
     """
     // if `data` is 1 cacheline or less in size
     // copy it into the existing `_current` array
     // to coalesce multiple tiny arrays
     // into a single bigger array
     if data.size() <= 64 then
       match data
       | let d: String =>
-         let a = d.array()
+         let a = d.array[E]()
          _current.copy_from(a, 0, _current.size(), a.size())
-      | let d: Array[U8] val =>
+         _size = _size + a.size()
+      | let d: ByteSeq =>
          _current.copy_from(d, 0, _current.size(), d.size())
+         _size = _size + d.size()
       end
-      _size = _size + data.size()
     else
       _append_current()
-      _chunks.push(data)
-      _size = _size + data.size()
+      match data
+      | let s: String =>
+        let encoded = s.array[E]()
+        _chunks.push(encoded)
+        _size = _size + encoded.size() // bytes actually queued, as in the small-data branch
+      | let d: ByteSeq =>
+        _chunks.push(d)
+        _size = _size + d.size()
+      end
     end
 
-  fun ref writev(data: ByteSeqIter) =>
+  fun ref writev[E: StringEncoder val = UTF8StringEncoder](data: (StringIter | ByteSeqIter)) =>
     """
-    Write ByteSeqs to the buffer.
+    Write each String or ByteSeq produced by `data` to the buffer, in iteration order.
     """
-    for chunk in data.values() do
-      write(chunk)
+    match data
+    | let strings: StringIter =>
+      for str in strings.values() do
+        write[E](str)
+      end
+    | let byte_seqs: ByteSeqIter =>
+      for bytes in byte_seqs.values() do
+        write(bytes)
+      end
     end
 
   fun ref done(): Array[ByteSeq] iso^ =>

diff --git a/packages/builtin/array.pony b/packages/builtin/array.pony
@@ -380,7 +380,12 @@ class Array[A] is Seq[A]
     Truncate an array to the given length, discarding excess elements. If the
     array is already smaller than len, do nothing.
     """
-    _size = _size.min(len)
+    if len >= _alloc then
+      _size = len.min(_alloc)
+      reserve(_alloc)
+    else
+      _size = len.min(_alloc)
+    end
 
   fun ref trim_in_place(from: USize = 0, to: USize = -1) =>
     """

diff --git a/packages/builtin/ascii_string_encoder.pony b/packages/builtin/ascii_string_encoder.pony
@@ -0,0 +1,18 @@
+primitive ASCIIStringEncoder is StringEncoder
+  """
+  Maps codepoints below 0x80 to themselves and every other codepoint to 0x3F ('?').
+  """
+  fun encode(value: U32): (USize, U32) =>
+    // The first tuple element is always 1; the second carries the resulting value.
+    let encoded: U32 = if value < 0x80 then value else 0x3F end
+    (1, encoded)
+
+primitive ASCIIStringDecoder is StringDecoder
+  """
+  Treats the most significant byte of `b` as one ASCII character; values of 0x80 and
+  above are replaced by codepoint 0xFFFD. The second tuple element is always 1.
+  """
+  fun decode(b: U32): (U32, U8) =>
+    let lead = (b and 0xFF000000) >> 24
+    let codepoint: U32 = if lead < 0x80 then lead else 0xFFFD end
+    (codepoint, 1)
diff --git a/packages/builtin/iso-8859-1_string_encoder.pony b/packages/builtin/iso-8859-1_string_encoder.pony
@@ -0,0 +1,13 @@
+primitive ISO88591StringEncoder is StringEncoder
+  """
+  Maps codepoints below 0x100 to themselves and every other codepoint to 0x3F ('?').
+  """
+  fun encode(value: U32): (USize, U32) =>
+    // The first tuple element is always 1; the second carries the resulting value.
+    let encoded: U32 = if value < 0x100 then value else 0x3F end
+    (1, encoded)
+
+primitive ISO88591StringDecoder is StringDecoder
+  // Returns the most significant byte of `b` unchanged, paired with a constant 1.
+  fun decode(b: U32): (U32, U8) =>
+    (b >> 24, 1)