Skip to content

Commit

Permalink
Address multiple pending issues (#39)
Browse files Browse the repository at this point in the history
* Fixed handling of empty strings and ByteArrayConverter[] that could lead to null reference exceptions.

* Added support for Decimal in RowOriented API (tuples are not currently supported, due to the need for an attribute)

* Fixed BufferOutputStream.Finish().

* Added gitattributes

* Fixed mkdir in build script if directory already exists

* Don't force CRLF on cs files.
  • Loading branch information
GPSnoopy authored Jan 24, 2019
1 parent 3776486 commit d68b728
Show file tree
Hide file tree
Showing 11 changed files with 123 additions and 30 deletions.
11 changes: 11 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Set the default behavior, in case people don't have core.autocrlf set.
* text=auto

# Declare files that will always have CRLF line endings on checkout.
*.bat text eol=crlf
*.csproj text eol=crlf
*.sln text eol=crlf
*.targets text eol=crlf

# Declare files that will always have LF line endings on checkout.
*.sh text eol=lf
2 changes: 1 addition & 1 deletion cpp/BufferOutputStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ extern "C"
TRYCATCH(*output_stream = new std::shared_ptr<arrow::io::BufferOutputStream>(new arrow::io::BufferOutputStream(*resizableBuffer));)
}

PARQUETSHARP_EXPORT ExceptionInfo* BufferOutputStream_GetBuffer(const std::shared_ptr<arrow::io::BufferOutputStream>* output_stream, std::shared_ptr<arrow::Buffer>** buffer)
PARQUETSHARP_EXPORT ExceptionInfo* BufferOutputStream_Finish(const std::shared_ptr<arrow::io::BufferOutputStream>* output_stream, std::shared_ptr<arrow::Buffer>** buffer)
{
TRYCATCH
(
Expand Down
30 changes: 30 additions & 0 deletions csharp.test/TestBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,35 @@ public static unsafe void TestParquetReadFromBuffer()
Assert.AreEqual(expected, allData);
}
}

[Test]
public static void TestBufferOutputStreamFinish()
{
var expected = Enumerable.Range(0, 100).ToArray();

using (var outStream = new BufferOutputStream())
{
// Write out a single column
using (var fileWriter = new ParquetFileWriter(outStream, new Column[] {new Column<int>("int_field")}))
using (var rowGroupWriter = fileWriter.AppendRowGroup())
using (var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
{
colWriter.WriteBatch(expected);
}

// Read it back
using (var buffer = outStream.Finish())
{
using (var inStream = new BufferReader(buffer))
using (var fileReader = new ParquetFileReader(inStream))
using (var rowGroup = fileReader.RowGroup(0))
using (var columnReader = rowGroup.Column(0).LogicalReader<int>())
{
var allData = columnReader.ReadAll((int) rowGroup.MetaData.NumRows);
Assert.AreEqual(expected, allData);
}
}
}
}
}
}
25 changes: 15 additions & 10 deletions csharp.test/TestLogicalTypeRoundtrip.cs
Original file line number Diff line number Diff line change
Expand Up @@ -440,10 +440,10 @@ private static ExpectedColumn[] CreateExpectedColumns()
Name = "string_field",
PhysicalType = PhysicalType.ByteArray,
LogicalType = LogicalType.Utf8,
Values = Enumerable.Range(0, NumRows).Select(i => i % 9 == 0 ? null : $"Hello, {i}!").ToArray(),
NullCount = (NumRows + 8) / 9,
NumValues = NumRows - (NumRows + 8) / 9,
Min = "Hello, 1!",
Values = Enumerable.Range(0, NumRows).Select(i => i % 9 == 0 ? i % 18 == 0 ? null : "" : $"Hello, {i}!").ToArray(),
NullCount = (NumRows + 17) / 18,
NumValues = NumRows - (NumRows + 17) / 18,
Min = "",
Max = "Hello, 98!",
Converter = StringConverter
},
Expand All @@ -464,10 +464,10 @@ private static ExpectedColumn[] CreateExpectedColumns()
{
Name = "bytearray_field",
PhysicalType = PhysicalType.ByteArray,
Values = Enumerable.Range(0, NumRows).Select(i => i % 3 == 0 ? null : BitConverter.GetBytes(i)).ToArray(),
NullCount = (NumRows + 2) / 3,
NumValues = NumRows - (NumRows + 2) / 3,
Min = BitConverter.GetBytes(1),
Values = Enumerable.Range(0, NumRows).Select(i => i % 3 == 0 ? i % 6 == 0 ? null : new byte[0] : BitConverter.GetBytes(i)).ToArray(),
NullCount = (NumRows + 5) / 6,
NumValues = NumRows - (NumRows + 5) / 6,
Min = new byte[0],
Max = BitConverter.GetBytes(NumRows - 1),
Converter = ByteArrayConverter
},
Expand Down Expand Up @@ -622,14 +622,19 @@ private static object ByteArrayConverter(object v)
{
var byteArray = (ByteArray) v;
var array = new byte[byteArray.Length];
Marshal.Copy(byteArray.Pointer, array, 0, array.Length);
if (byteArray.Length != 0)
{
Marshal.Copy(byteArray.Pointer, array, 0, array.Length);
}
return array;
}

private static unsafe object StringConverter(object v)
{
var byteArray = (ByteArray) v;
return System.Text.Encoding.UTF8.GetString((byte*) byteArray.Pointer, byteArray.Length);
return byteArray.Length == 0
? string.Empty
: System.Text.Encoding.UTF8.GetString((byte*) byteArray.Pointer, byteArray.Length);
}

private sealed class ExpectedColumn
Expand Down
32 changes: 19 additions & 13 deletions csharp.test/TestRowOrientedParquetFile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -37,30 +37,30 @@ public static void TestRoundtrip()
{
TestRoundtrip(new[]
{
new Row1 {A = 123, B = 3.14f, C = new DateTime(1981, 06, 10)},
new Row1 {A = 456, B = 1.27f, C = new DateTime(1987, 03, 16)},
new Row1 {A = 789, B = 6.66f, C = new DateTime(2018, 05, 02)}
(123, 3.14f, new DateTime(1981, 06, 10)),
(456, 1.27f, new DateTime(1987, 03, 16)),
(789, 6.66f, new DateTime(2018, 05, 02))
});

TestRoundtrip(new[]
{
new Row2 {A = 123, B = 3.14f, C = new DateTime(1981, 06, 10)},
new Row2 {A = 456, B = 1.27f, C = new DateTime(1987, 03, 16)},
new Row2 {A = 789, B = 6.66f, C = new DateTime(2018, 05, 02)}
Tuple.Create(123, 3.14f, new DateTime(1981, 06, 10)),
Tuple.Create(456, 1.27f, new DateTime(1987, 03, 16)),
Tuple.Create(789, 6.66f, new DateTime(2018, 05, 02))
});

TestRoundtrip(new[]
{
(123, 3.14f, new DateTime(1981, 06, 10)),
(456, 1.27f, new DateTime(1987, 03, 16)),
(789, 6.66f, new DateTime(2018, 05, 02))
new Row1 {A = 123, B = 3.14f, C = new DateTime(1981, 06, 10), D = 123.1M},
new Row1 {A = 456, B = 1.27f, C = new DateTime(1987, 03, 16), D = 456.12M},
new Row1 {A = 789, B = 6.66f, C = new DateTime(2018, 05, 02), D = 789.123M}
});

TestRoundtrip(new[]
{
Tuple.Create(123, 3.14f, new DateTime(1981, 06, 10)),
Tuple.Create(456, 1.27f, new DateTime(1987, 03, 16)),
Tuple.Create(789, 6.66f, new DateTime(2018, 05, 02))
new Row2 {A = 123, B = 3.14f, C = new DateTime(1981, 06, 10), D = 123.1M},
new Row2 {A = 456, B = 1.27f, C = new DateTime(1987, 03, 16), D = 456.12M},
new Row2 {A = 789, B = 6.66f, C = new DateTime(2018, 05, 02), D = 789.123M}
});
}

Expand Down Expand Up @@ -90,11 +90,14 @@ private sealed class Row1 : IEquatable<Row1>
public float B;
public DateTime C;

[ParquetDecimalScale(3)]
public decimal D;

public bool Equals(Row1 other)
{
if (ReferenceEquals(null, other)) return false;
if (ReferenceEquals(this, other)) return true;
return A == other.A && B.Equals(other.B) && C.Equals(other.C);
return A == other.A && B.Equals(other.B) && C.Equals(other.C) && D.Equals(other.D);
}
}

Expand All @@ -103,6 +106,9 @@ private struct Row2
public int A { get; set; }
public float B { get; set; }
public DateTime C { get; set; }

[ParquetDecimalScale(3)]
public decimal D { get; set; }
}

#if DUMP_EXPRESSION_TREES
Expand Down
1 change: 0 additions & 1 deletion csharp/IO/BufferOutputStream.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ private static IntPtr Create(ResizableBuffer buffer)
[DllImport(ParquetDll.Name)]
private static extern IntPtr BufferOutputStream_Create(out IntPtr outputStream);


[DllImport(ParquetDll.Name)]
private static extern IntPtr BufferOutputStream_Create_From_ResizableBuffer(IntPtr resizableBuffer, out IntPtr outputStream);

Expand Down
9 changes: 7 additions & 2 deletions csharp/LogicalRead.cs
Original file line number Diff line number Diff line change
Expand Up @@ -261,13 +261,18 @@ private static void ConvertByteArray(ReadOnlySpan<ByteArray> source, ReadOnlySpa

private static unsafe string ToString(ByteArray byteArray)
{
return System.Text.Encoding.UTF8.GetString((byte*) byteArray.Pointer, byteArray.Length);
return byteArray.Length == 0
? string.Empty
: System.Text.Encoding.UTF8.GetString((byte*) byteArray.Pointer, byteArray.Length);
}

private static byte[] ToByteArray(ByteArray byteArray)
{
var array = new byte[byteArray.Length];
Marshal.Copy(byteArray.Pointer, array, 0, array.Length);
if (byteArray.Length != 0)
{
Marshal.Copy(byteArray.Pointer, array, 0, array.Length);
}
return array;
}

Expand Down
2 changes: 1 addition & 1 deletion csharp/ParquetSharp.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
<NoWarn>1591;</NoWarn>
<Version>1.5.1.1-beta1</Version>
<Version>1.5.1.1-beta2</Version>
<Company>G-Research</Company>
<Authors>G-Research</Authors>
<Product>ParquetSharp</Product>
Expand Down
15 changes: 15 additions & 0 deletions csharp/RowOriented/ParquetDecimalScale.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
using System;

namespace ParquetSharp.RowOriented
{
[AttributeUsage(AttributeTargets.Field | AttributeTargets.Property)]
public sealed class ParquetDecimalScaleAttribute : Attribute
{
public ParquetDecimalScaleAttribute(int scale)
{
Scale = scale;
}

public readonly int Scale;
}
}
24 changes: 23 additions & 1 deletion csharp/RowOriented/ParquetFile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ private static ParquetRowReader<TTuple>.ReadAction CreateReadDelegate<TTuple>()
private static (Column[] columns, ParquetRowWriter<TTuple>.WriteAction writeDelegate) CreateWriteDelegate<TTuple>()
{
var fields = GetFieldsAndProperties(typeof(TTuple), BindingFlags.Public | BindingFlags.Instance).ToArray();
var columns = fields.Select(f => new Column(f.type, f.name)).ToArray();
var columns = fields.Select(GetColumn).ToArray();

// Parameters
var writer = Expression.Parameter(typeof(ParquetRowWriter<TTuple>), "writer");
Expand Down Expand Up @@ -228,6 +228,28 @@ private static (string name, Type type, MemberInfo info)[] GetFieldsAndPropertie
return list.ToArray();
}

private static Column GetColumn((string name, Type type, MemberInfo info) field)
{
var isDecimal = field.type == typeof(decimal) || field.type == typeof(decimal?);
var decimalScale = field.info.GetCustomAttributes(typeof(ParquetDecimalScaleAttribute))
.Cast<ParquetDecimalScaleAttribute>()
.SingleOrDefault();

if (!isDecimal && decimalScale != null)
{
throw new ArgumentException($"field '{field.name}' has a {nameof(ParquetDecimalScaleAttribute)} despite not being a decimal type");
}

if (isDecimal && decimalScale == null)
{
throw new ArgumentException($"field '{field.name}' has no {nameof(ParquetDecimalScaleAttribute)} despite being a decimal type");
}

return isDecimal
? new ColumnDecimal(field.name, precision: 29, scale: decimalScale?.Scale ?? -1, isNullable: field.type == typeof(decimal?))
: new Column(field.type, field.name);
}

private static readonly ConcurrentDictionary<Type, Delegate> ReadDelegatesCache =
new ConcurrentDictionary<Type, Delegate>();

Expand Down
2 changes: 1 addition & 1 deletion vcpkg_windows.bat
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
mkdir build || goto :error
mkdir build
cd build || goto :error
git clone https://github.com/Microsoft/vcpkg.git vcpkg.windows || goto :error
cd vcpkg.windows || goto :error
Expand Down

0 comments on commit d68b728

Please sign in to comment.