diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..53ecd5f9 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,11 @@ +# Set the default behavior, in case people don't have core.autocrlf set. +* text=auto + +# Declare files that will always have CRLF line endings on checkout. +*.bat text eol=crlf +*.csproj text eol=crlf +*.sln text eol=crlf +*.targets text eol=crlf + +# Declare files that will always have LF line endings on checkout. +*.sh text eol=lf diff --git a/cpp/BufferOutputStream.cpp b/cpp/BufferOutputStream.cpp index 862e96c8..91bf8059 100644 --- a/cpp/BufferOutputStream.cpp +++ b/cpp/BufferOutputStream.cpp @@ -23,7 +23,7 @@ extern "C" TRYCATCH(*output_stream = new std::shared_ptr(new arrow::io::BufferOutputStream(*resizableBuffer));) } - PARQUETSHARP_EXPORT ExceptionInfo* BufferOutputStream_GetBuffer(const std::shared_ptr* output_stream, std::shared_ptr** buffer) + PARQUETSHARP_EXPORT ExceptionInfo* BufferOutputStream_Finish(const std::shared_ptr* output_stream, std::shared_ptr** buffer) { TRYCATCH ( diff --git a/csharp.test/TestBuffer.cs b/csharp.test/TestBuffer.cs index 7bea218e..ff58ce7e 100644 --- a/csharp.test/TestBuffer.cs +++ b/csharp.test/TestBuffer.cs @@ -52,5 +52,35 @@ public static unsafe void TestParquetReadFromBuffer() Assert.AreEqual(expected, allData); } } + + [Test] + public static void TestBufferOutputStreamFinish() + { + var expected = Enumerable.Range(0, 100).ToArray(); + + using (var outStream = new BufferOutputStream()) + { + // Write out a single column + using (var fileWriter = new ParquetFileWriter(outStream, new Column[] {new Column("int_field")})) + using (var rowGroupWriter = fileWriter.AppendRowGroup()) + using (var colWriter = rowGroupWriter.NextColumn().LogicalWriter()) + { + colWriter.WriteBatch(expected); + } + + // Read it back + using (var buffer = outStream.Finish()) + { + using (var inStream = new BufferReader(buffer)) + using (var fileReader = new ParquetFileReader(inStream)) + using (var rowGroup = fileReader.RowGroup(0)) + using (var columnReader = rowGroup.Column(0).LogicalReader()) + { + var allData = columnReader.ReadAll((int) rowGroup.MetaData.NumRows); + Assert.AreEqual(expected, allData); + } + } + } + } } } diff --git a/csharp.test/TestLogicalTypeRoundtrip.cs b/csharp.test/TestLogicalTypeRoundtrip.cs index e7a15e74..7bdd732c 100644 --- a/csharp.test/TestLogicalTypeRoundtrip.cs +++ b/csharp.test/TestLogicalTypeRoundtrip.cs @@ -440,10 +440,10 @@ private static ExpectedColumn[] CreateExpectedColumns() Name = "string_field", PhysicalType = PhysicalType.ByteArray, LogicalType = LogicalType.Utf8, - Values = Enumerable.Range(0, NumRows).Select(i => i % 9 == 0 ? null : $"Hello, {i}!").ToArray(), - NullCount = (NumRows + 8) / 9, - NumValues = NumRows - (NumRows + 8) / 9, - Min = "Hello, 1!", + Values = Enumerable.Range(0, NumRows).Select(i => i % 9 == 0 ? i % 18 == 0 ? null : "" : $"Hello, {i}!").ToArray(), + NullCount = (NumRows + 17) / 18, + NumValues = NumRows - (NumRows + 17) / 18, + Min = "", Max = "Hello, 98!", Converter = StringConverter }, @@ -464,10 +464,10 @@ private static ExpectedColumn[] CreateExpectedColumns() { Name = "bytearray_field", PhysicalType = PhysicalType.ByteArray, - Values = Enumerable.Range(0, NumRows).Select(i => i % 3 == 0 ? null : BitConverter.GetBytes(i)).ToArray(), - NullCount = (NumRows + 2) / 3, - NumValues = NumRows - (NumRows + 2) / 3, - Min = BitConverter.GetBytes(1), + Values = Enumerable.Range(0, NumRows).Select(i => i % 3 == 0 ? i % 6 == 0 ? null : new byte[0] : BitConverter.GetBytes(i)).ToArray(), + NullCount = (NumRows + 5) / 6, + NumValues = NumRows - (NumRows + 5) / 6, + Min = new byte[0], Max = BitConverter.GetBytes(NumRows - 1), Converter = ByteArrayConverter }, @@ -622,14 +622,19 @@ private static object ByteArrayConverter(object v) { var byteArray = (ByteArray) v; var array = new byte[byteArray.Length]; - Marshal.Copy(byteArray.Pointer, array, 0, array.Length); + if (byteArray.Length != 0) + { + Marshal.Copy(byteArray.Pointer, array, 0, array.Length); + } return array; } private static unsafe object StringConverter(object v) { var byteArray = (ByteArray) v; - return System.Text.Encoding.UTF8.GetString((byte*) byteArray.Pointer, byteArray.Length); + return byteArray.Length == 0 + ? string.Empty + : System.Text.Encoding.UTF8.GetString((byte*) byteArray.Pointer, byteArray.Length); } private sealed class ExpectedColumn diff --git a/csharp.test/TestRowOrientedParquetFile.cs b/csharp.test/TestRowOrientedParquetFile.cs index 43c31605..d4bc6b90 100644 --- a/csharp.test/TestRowOrientedParquetFile.cs +++ b/csharp.test/TestRowOrientedParquetFile.cs @@ -37,30 +37,30 @@ public static void TestRoundtrip() { TestRoundtrip(new[] { - new Row1 {A = 123, B = 3.14f, C = new DateTime(1981, 06, 10)}, - new Row1 {A = 456, B = 1.27f, C = new DateTime(1987, 03, 16)}, - new Row1 {A = 789, B = 6.66f, C = new DateTime(2018, 05, 02)} + (123, 3.14f, new DateTime(1981, 06, 10)), + (456, 1.27f, new DateTime(1987, 03, 16)), + (789, 6.66f, new DateTime(2018, 05, 02)) }); TestRoundtrip(new[] { - new Row2 {A = 123, B = 3.14f, C = new DateTime(1981, 06, 10)}, - new Row2 {A = 456, B = 1.27f, C = new DateTime(1987, 03, 16)}, - new Row2 {A = 789, B = 6.66f, C = new DateTime(2018, 05, 02)} + Tuple.Create(123, 3.14f, new DateTime(1981, 06, 10)), + Tuple.Create(456, 1.27f, new DateTime(1987, 03, 16)), + Tuple.Create(789, 6.66f, new DateTime(2018, 05, 02)) }); TestRoundtrip(new[] { - (123, 3.14f, new DateTime(1981, 06, 10)), - (456, 1.27f, new DateTime(1987, 03, 16)), - (789, 6.66f, new DateTime(2018, 05, 02)) + new Row1 {A = 123, B = 3.14f, C = new DateTime(1981, 06, 10), D = 123.1M}, + new Row1 {A = 456, B = 1.27f, C = new DateTime(1987, 03, 16), D = 456.12M}, + new Row1 {A = 789, B = 6.66f, C = new DateTime(2018, 05, 02), D = 789.123M} }); TestRoundtrip(new[] { - Tuple.Create(123, 3.14f, new DateTime(1981, 06, 10)), - Tuple.Create(456, 1.27f, new DateTime(1987, 03, 16)), - Tuple.Create(789, 6.66f, new DateTime(2018, 05, 02)) + new Row2 {A = 123, B = 3.14f, C = new DateTime(1981, 06, 10), D = 123.1M}, + new Row2 {A = 456, B = 1.27f, C = new DateTime(1987, 03, 16), D = 456.12M}, + new Row2 {A = 789, B = 6.66f, C = new DateTime(2018, 05, 02), D = 789.123M} }); } @@ -90,11 +90,14 @@ private sealed class Row1 : IEquatable public float B; public DateTime C; + [ParquetDecimalScale(3)] + public decimal D; + public bool Equals(Row1 other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; - return A == other.A && B.Equals(other.B) && C.Equals(other.C); + return A == other.A && B.Equals(other.B) && C.Equals(other.C) && D.Equals(other.D); } } @@ -103,6 +106,9 @@ private struct Row2 public int A { get; set; } public float B { get; set; } public DateTime C { get; set; } + + [ParquetDecimalScale(3)] + public decimal D { get; set; } } #if DUMP_EXPRESSION_TREES diff --git a/csharp/IO/BufferOutputStream.cs b/csharp/IO/BufferOutputStream.cs index 1a4680f7..9c6d2471 100644 --- a/csharp/IO/BufferOutputStream.cs +++ b/csharp/IO/BufferOutputStream.cs @@ -40,7 +40,6 @@ private static IntPtr Create(ResizableBuffer buffer) [DllImport(ParquetDll.Name)] private static extern IntPtr BufferOutputStream_Create(out IntPtr outputStream); - [DllImport(ParquetDll.Name)] private static extern IntPtr BufferOutputStream_Create_From_ResizableBuffer(IntPtr resizableBuffer, out IntPtr outputStream); diff --git a/csharp/LogicalRead.cs b/csharp/LogicalRead.cs index 1e139844..3c4298c8 100644 --- a/csharp/LogicalRead.cs +++ b/csharp/LogicalRead.cs @@ -261,13 +261,18 @@ private static void ConvertByteArray(ReadOnlySpan source, ReadOnlySpa private static unsafe string ToString(ByteArray byteArray) { - return System.Text.Encoding.UTF8.GetString((byte*) byteArray.Pointer, byteArray.Length); + return byteArray.Length == 0 + ? string.Empty + : System.Text.Encoding.UTF8.GetString((byte*) byteArray.Pointer, byteArray.Length); } private static byte[] ToByteArray(ByteArray byteArray) { var array = new byte[byteArray.Length]; - Marshal.Copy(byteArray.Pointer, array, 0, array.Length); + if (byteArray.Length != 0) + { + Marshal.Copy(byteArray.Pointer, array, 0, array.Length); + } return array; } diff --git a/csharp/ParquetSharp.csproj b/csharp/ParquetSharp.csproj index f0e46d30..56c6ff71 100644 --- a/csharp/ParquetSharp.csproj +++ b/csharp/ParquetSharp.csproj @@ -11,7 +11,7 @@ true true 1591; - 1.5.1.1-beta1 + 1.5.1.1-beta2 G-Research G-Research ParquetSharp diff --git a/csharp/RowOriented/ParquetDecimalScale.cs b/csharp/RowOriented/ParquetDecimalScale.cs new file mode 100644 index 00000000..75f8bfd3 --- /dev/null +++ b/csharp/RowOriented/ParquetDecimalScale.cs @@ -0,0 +1,15 @@ +using System; + +namespace ParquetSharp.RowOriented +{ + [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property)] + public sealed class ParquetDecimalScaleAttribute : Attribute + { + public ParquetDecimalScaleAttribute(int scale) + { + Scale = scale; + } + + public readonly int Scale; + } +} diff --git a/csharp/RowOriented/ParquetFile.cs b/csharp/RowOriented/ParquetFile.cs index 787aeba9..09ae62e7 100644 --- a/csharp/RowOriented/ParquetFile.cs +++ b/csharp/RowOriented/ParquetFile.cs @@ -127,7 +127,7 @@ private static ParquetRowReader.ReadAction CreateReadDelegate() private static (Column[] columns, ParquetRowWriter.WriteAction writeDelegate) CreateWriteDelegate() { var fields = GetFieldsAndProperties(typeof(TTuple), BindingFlags.Public | BindingFlags.Instance).ToArray(); - var columns = fields.Select(f => new Column(f.type, f.name)).ToArray(); + var columns = fields.Select(GetColumn).ToArray(); // Parameters var writer = Expression.Parameter(typeof(ParquetRowWriter), "writer"); @@ -228,6 +228,28 @@ private static (string name, Type type, MemberInfo info)[] GetFieldsAndPropertie return list.ToArray(); } + private static Column GetColumn((string name, Type type, MemberInfo info) field) + { + var isDecimal = field.type == typeof(decimal) || field.type == typeof(decimal?); + var decimalScale = field.info.GetCustomAttributes(typeof(ParquetDecimalScaleAttribute)) + .Cast() + .SingleOrDefault(); + + if (!isDecimal && decimalScale != null) + { + throw new ArgumentException($"field '{field.name}' has a {nameof(ParquetDecimalScaleAttribute)} despite not being a decimal type"); + } + + if (isDecimal && decimalScale == null) + { + throw new ArgumentException($"field '{field.name}' has no {nameof(ParquetDecimalScaleAttribute)} despite being a decimal type"); + } + + return isDecimal + ? new ColumnDecimal(field.name, precision: 29, scale: decimalScale?.Scale ?? -1, isNullable: field.type == typeof(decimal?)) + : new Column(field.type, field.name); + } + private static readonly ConcurrentDictionary ReadDelegatesCache = new ConcurrentDictionary(); diff --git a/vcpkg_windows.bat b/vcpkg_windows.bat index f8ab8d60..1b32160c 100644 --- a/vcpkg_windows.bat +++ b/vcpkg_windows.bat @@ -1,4 +1,4 @@ -mkdir build || goto :error +mkdir build cd build || goto :error git clone https://github.com/Microsoft/vcpkg.git vcpkg.windows || goto :error cd vcpkg.windows || goto :error