Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate latest dataframe changes from asmirnov/machinelearning into local branch #2

Merged
merged 18 commits into from
Jun 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
e7f9daf
Fix using ToList on Row Collection
asmirnov82 May 15, 2023
f4d3a6e
Fix DataFrame bounds checking on indexing elements
asmirnov82 May 15, 2023
f91c6bf
Merge pull request #2 from JakeRadMSFT/u/jakerad/generic-math
asmirnov82 May 20, 2023
c1fc16e
Fix build of Microsoft.Data.Analysis.Tests
asmirnov82 May 20, 2023
f551fd1
Merge remote-tracking branch 'origin/main' into jakerad_generic_math
asmirnov82 May 20, 2023
c080c46
Fix merge issues
asmirnov82 May 20, 2023
c4388dc
Fix the behavior or column SetName method
asmirnov82 May 20, 2023
9218375
Merge branch 'fix_utility_parity_foreach' into jakerad_generic_math
asmirnov82 May 21, 2023
8cd3d39
Merge commit 'f4d3a6ecac44fbc83e121c9476fe240bf9329316' into jakerad_…
asmirnov82 May 21, 2023
92e2d72
Reset RowCount to zero, when DataFrame is empty
asmirnov82 May 21, 2023
e88b642
Remove redundant column names collection from DataFrameColumnCollection
asmirnov82 May 21, 2023
91d0f1f
Add missing implementation for datetime relevant arrow type
asmirnov82 May 21, 2023
4ebddeb
Fix DataFrame Merge issue
asmirnov82 May 21, 2023
da2cb99
Clean switch by type in binary operations
asmirnov82 May 25, 2023
1de06be
Simplify getting mutable buffers
asmirnov82 May 25, 2023
6be198d
Don't convert buffer to mutable if it not required
asmirnov82 May 25, 2023
a20d6e0
Merge remote-tracking branch 'origin/main' into jakerad_generic_math
JakeRadMSFT Jun 23, 2023
b78c0a7
Merge branch 'u/jakerad/generic-math' into jakerad_generic_math
JakeRadMSFT Jun 23, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion eng/Versions.props
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
<SystemTextJsonVersion>6.0.1</SystemTextJsonVersion>
<SystemThreadingChannelsVersion>4.7.1</SystemThreadingChannelsVersion>
<!-- Other product dependencies -->
<ApacheArrowVersion>2.0.0</ApacheArrowVersion>
<ApacheArrowVersion>11.0.0</ApacheArrowVersion>
<GoogleProtobufVersion>3.19.6</GoogleProtobufVersion>
<LightGBMVersion>2.3.1</LightGBMVersion>
<MicrosoftCodeAnalysisAnalyzersVersion>3.3.0</MicrosoftCodeAnalysisAnalyzersVersion>
Expand Down
15 changes: 12 additions & 3 deletions src/Microsoft.Data.Analysis/DataFrame.Arrow.cs
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,18 @@ private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray
AppendDataFrameColumnFromArrowArray(fieldsEnumerator.Current, structArrayEnumerator.Current, ret, field.Name + "_");
}
break;
case ArrowTypeId.Decimal:
case ArrowTypeId.Date64:
Date64Array arrowDate64Array = (Date64Array)arrowArray;
dataFrameColumn = new PrimitiveDataFrameColumn<DateTime>(fieldName, arrowDate64Array.Data.Length);
for (int i = 0; i < arrowDate64Array.Data.Length; i++)
{
dataFrameColumn[i] = arrowDate64Array.GetDateTime(i);
}
break;
case ArrowTypeId.Decimal128:
case ArrowTypeId.Decimal256:
case ArrowTypeId.Binary:
case ArrowTypeId.Date32:
case ArrowTypeId.Date64:
case ArrowTypeId.Dictionary:
case ArrowTypeId.FixedSizedBinary:
case ArrowTypeId.HalfFloat:
Expand All @@ -114,6 +122,7 @@ private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray
case ArrowTypeId.Null:
case ArrowTypeId.Time32:
case ArrowTypeId.Time64:
case ArrowTypeId.Timestamp:
default:
throw new NotImplementedException($"{fieldType.Name}");
}
Expand Down Expand Up @@ -145,7 +154,7 @@ public static DataFrame FromArrowRecordBatch(RecordBatch recordBatch)
}

/// <summary>
/// Returns an <see cref="IEnumerable{RecordBatch}"/> without copying data
/// Returns an <see cref="IEnumerable{RecordBatch}"/> mostly without copying data
/// </summary>
public IEnumerable<RecordBatch> ToArrowRecordBatches()
{
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.Data.Analysis/DataFrame.Join.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ private void SetSuffixForDuplicatedColumnNames(DataFrame dataFrame, DataFrameCol
{
// Pre-existing column. Change name
DataFrameColumn existingColumn = dataFrame.Columns[index];
dataFrame._columnCollection.SetColumnName(existingColumn, existingColumn.Name + leftSuffix);
existingColumn.SetName(existingColumn.Name + leftSuffix);
column.SetName(column.Name + rightSuffix);
index = dataFrame._columnCollection.IndexOf(column.Name);
}
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.Data.Analysis/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ public DataFrame AddPrefix(string prefix, bool inPlace = false)
for (int i = 0; i < df.Columns.Count; i++)
{
DataFrameColumn column = df.Columns[i];
df._columnCollection.SetColumnName(column, prefix + column.Name);
column.SetName(prefix + column.Name);
df.OnColumnsChanged();
}
return df;
Expand All @@ -316,7 +316,7 @@ public DataFrame AddSuffix(string suffix, bool inPlace = false)
for (int i = 0; i < df.Columns.Count; i++)
{
DataFrameColumn column = df.Columns[i];
df._columnCollection.SetColumnName(column, column.Name + suffix);
column.SetName(column.Name + suffix);
df.OnColumnsChanged();
}
return df;
Expand Down
42 changes: 34 additions & 8 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,26 @@ protected set
}
}

// Column collections that own this column.
// The current API allows a column to be added to multiple DataFrames, which is why a list is needed.
private readonly List<DataFrameColumnCollection> _ownerColumnCollections = new();

/// <summary>
/// Registers <paramref name="columCollection"/> as an owner of this column.
/// A collection that is already registered is not added a second time.
/// </summary>
/// <param name="columCollection">The column collection to record as an owner.</param>
internal void AddOwner(DataFrameColumnCollection columCollection)
{
    // Already tracked: keep the owner list free of duplicates.
    if (_ownerColumnCollections.Contains(columCollection))
    {
        return;
    }

    _ownerColumnCollections.Add(columCollection);
}

/// <summary>
/// Unregisters <paramref name="columCollection"/> as an owner of this column.
/// Does nothing when the collection is not currently registered.
/// </summary>
/// <param name="columCollection">The column collection to remove from the owner list.</param>
internal void RemoveOwner(DataFrameColumnCollection columCollection)
{
    // List<T>.Remove returns false and leaves the list untouched when the item is absent,
    // so a preceding Contains check would only add a second linear scan.
    _ownerColumnCollections.Remove(columCollection);
}

/// <summary>
/// The number of <see langword="null" /> values in this column.
/// </summary>
Expand All @@ -95,24 +115,30 @@ public abstract long NullCount
private string _name;

/// <summary>
/// The name of this column.
/// The column name.
/// </summary>
public string Name => _name;

/// <summary>
/// Updates the name of this column.
/// Updates the column name.
/// </summary>
/// <param name="newName">The new name.</param>
/// <param name="dataFrame">If passed in, update the column name in <see cref="DataFrame.Columns"/></param>
public void SetName(string newName, DataFrame dataFrame = null)
public void SetName(string newName)
{
if (!(dataFrame is null))
{
dataFrame.Columns.SetColumnName(this, newName);
}
foreach (var owner in _ownerColumnCollections)
owner.UpdateColumnNameMetadata(this, newName);

_name = newName;
}

/// <summary>
/// Updates the name of this column.
/// </summary>
/// <param name="newName">The new name.</param>
/// <param name="dataFrame">Ignored (kept only for backward compatibility).</param>
[Obsolete("The dataFrame argument is ignored. Use SetName(string) instead.")]
public void SetName(string newName, DataFrame dataFrame) => SetName(newName);

/// <summary>
/// The type of data this column holds.
/// </summary>
Expand Down
29 changes: 23 additions & 6 deletions src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

Expand Down Expand Up @@ -38,11 +38,23 @@ internal IReadOnlyList<string> GetColumnNames()
return ret;
}

/// <summary>
/// Renames the column currently called <paramref name="currentName"/> to <paramref name="newName"/>.
/// </summary>
/// <param name="currentName">The name used to look the column up in this collection.</param>
/// <param name="newName">The name handed to the column's <c>SetName</c> method.</param>
public void RenameColumn(string currentName, string newName) => this[currentName].SetName(newName);

/// <summary>
/// Renames <paramref name="column"/> by forwarding to its <c>SetName</c> method.
/// </summary>
/// <param name="column">The column to rename.</param>
/// <param name="newName">The new name.</param>
[Obsolete("Use RenameColumn or DataFrameColumn.SetName(string) instead.")]
public void SetColumnName(DataFrameColumn column, string newName)
{
    column.SetName(newName);
}

//Updates column's metadata (is used as a callback from Column class)
internal void UpdateColumnNameMetadata(DataFrameColumn column, string newName)
{
string currentName = column.Name;
int currentIndex = _columnNameToIndexDictionary[currentName];
column.SetName(newName);
_columnNameToIndexDictionary.Remove(currentName);
_columnNameToIndexDictionary.Add(newName, currentIndex);
ColumnsChanged?.Invoke();
Expand All @@ -66,7 +78,7 @@ protected override void InsertItem(int columnIndex, DataFrameColumn column)
}
else if (column.Length != RowCount)
{
//check all columns in the dataframe have the same length (amount of rows)
//check all columns in the dataframe have the same length (amount of rows)
throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column));
}

Expand All @@ -75,7 +87,7 @@ protected override void InsertItem(int columnIndex, DataFrameColumn column)
throw new ArgumentException(string.Format(Strings.DuplicateColumnName, column.Name), nameof(column));
}

RowCount = column.Length;
column.AddOwner(this);

_columnNameToIndexDictionary[column.Name] = columnIndex;
for (int i = columnIndex + 1; i < Count; i++)
Expand All @@ -100,7 +112,10 @@ protected override void SetItem(int columnIndex, DataFrameColumn column)
}
_columnNameToIndexDictionary.Remove(this[columnIndex].Name);
_columnNameToIndexDictionary[column.Name] = columnIndex;

this[columnIndex].RemoveOwner(this);
base.SetItem(columnIndex, column);

ColumnsChanged?.Invoke();
}

Expand All @@ -111,6 +126,8 @@ protected override void RemoveItem(int columnIndex)
{
_columnNameToIndexDictionary[this[i].Name]--;
}

this[columnIndex].RemoveOwner(this);
base.RemoveItem(columnIndex);

//Reset RowCount if the last column was removed and dataframe is empty
Expand Down Expand Up @@ -204,10 +221,10 @@ public PrimitiveDataFrameColumn<T> GetPrimitiveColumn<T>(string name)
}

/// <summary>
/// Gets the <see cref="PrimitiveDataFrameColumn{DateTime}"/> with the specified <paramref name="name"/>.
/// Gets the <see cref="PrimitiveDataFrameColumn{T}"/> with the specified <paramref name="name"/>.
/// </summary>
/// <param name="name">The name of the column</param>
/// <returns><see cref="PrimitiveDataFrameColumn{DateTime}"/>.</returns>
/// <returns><see cref="PrimitiveDataFrameColumn{T}"/>.</returns>
/// <exception cref="ArgumentException">A column named <paramref name="name"/> cannot be found, or if the column's type doesn't match.</exception>
public PrimitiveDataFrameColumn<DateTime> GetDateTimeColumn(string name)
{
Expand Down
12 changes: 0 additions & 12 deletions src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -374,18 +374,6 @@ internal int MaxRecordBatchLength(long startIndex)
return Buffers[arrayIndex].Length - (int)startIndex;
}

internal ReadOnlyMemory<byte> GetValueBuffer(long startIndex)
{
int arrayIndex = GetArrayContainingRowIndex(startIndex);
return Buffers[arrayIndex].ReadOnlyBuffer;
}

internal ReadOnlyMemory<byte> GetNullBuffer(long startIndex)
{
int arrayIndex = GetArrayContainingRowIndex(startIndex);
return NullBitMapBuffers[arrayIndex].ReadOnlyBuffer;
}

public IReadOnlyList<T?> this[long startIndex, int length]
{
get
Expand Down
57 changes: 44 additions & 13 deletions src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using Apache.Arrow;
using Apache.Arrow.Types;
using Microsoft.ML;
Expand Down Expand Up @@ -104,6 +105,8 @@ private IArrowType GetArrowType()
return UInt64Type.Default;
else if (typeof(T) == typeof(ushort))
return UInt16Type.Default;
else if (typeof(T) == typeof(DateTime))
return Date64Type.Default;
else
throw new NotImplementedException(nameof(T));
}
Expand All @@ -127,36 +130,64 @@ protected internal override Apache.Arrow.Array ToArrowArray(long startIndex, int
{
int arrayIndex = numberOfRows == 0 ? 0 : _columnContainer.GetArrayContainingRowIndex(startIndex);
int offset = (int)(startIndex - arrayIndex * ReadOnlyDataFrameBuffer<T>.MaxCapacity);

if (numberOfRows != 0 && numberOfRows > _columnContainer.Buffers[arrayIndex].Length - offset)
{
throw new ArgumentException(Strings.SpansMultipleBuffers, nameof(numberOfRows));
}
ArrowBuffer valueBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.GetValueBuffer(startIndex));
ArrowBuffer nullBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.GetNullBuffer(startIndex));

int nullCount = GetNullCount(startIndex, numberOfRows);

//DateTime requires conversion
if (this.DataType == typeof(DateTime))
{
if (numberOfRows == 0)
return new Date64Array(ArrowBuffer.Empty, ArrowBuffer.Empty, numberOfRows, nullCount, offset);

ReadOnlyDataFrameBuffer<T> valueBuffer = (numberOfRows == 0) ? null : _columnContainer.Buffers[arrayIndex];
ReadOnlyDataFrameBuffer<byte> nullBuffer = (numberOfRows == 0) ? null : _columnContainer.NullBitMapBuffers[arrayIndex];

ReadOnlySpan<DateTime> valueSpan = MemoryMarshal.Cast<T, DateTime>(valueBuffer.ReadOnlySpan);
Date64Array.Builder builder = new Date64Array.Builder().Reserve(valueBuffer.Length);

for (int i = 0; i < valueBuffer.Length; i++)
{
if (BitUtility.GetBit(nullBuffer.ReadOnlySpan, i))
builder.Append(valueSpan[i]);
else
builder.AppendNull();
}

return builder.Build();
}

//No conversion
ArrowBuffer arrowValueBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.Buffers[arrayIndex].ReadOnlyBuffer);
ArrowBuffer arrowNullBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.NullBitMapBuffers[arrayIndex].ReadOnlyBuffer);

Type type = this.DataType;
if (type == typeof(bool))
return new BooleanArray(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new BooleanArray(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(double))
return new DoubleArray(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new DoubleArray(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(float))
return new FloatArray(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new FloatArray(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(int))
return new Int32Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new Int32Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(long))
return new Int64Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new Int64Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(sbyte))
return new Int8Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new Int8Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(short))
return new Int16Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new Int16Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(uint))
return new UInt32Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new UInt32Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(ulong))
return new UInt64Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new UInt64Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(ushort))
return new UInt16Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new UInt16Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else if (type == typeof(byte))
return new UInt8Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset);
return new UInt8Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset);
else
throw new NotImplementedException(type.ToString());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ public void TestArrowIntegration()
.Append("ULongColumn", false, new UInt64Array.Builder().AppendRange(Enumerable.Repeat((ulong)1, 10)).Build())
.Append("ByteColumn", false, new Int8Array.Builder().AppendRange(Enumerable.Repeat((sbyte)1, 10)).Build())
.Append("UByteColumn", false, new UInt8Array.Builder().AppendRange(Enumerable.Repeat((byte)1, 10)).Build())
.Append("Date64Column", false, new Date64Array.Builder().AppendRange(Enumerable.Repeat(DateTime.Now, 10)).Build())
.Build();

DataFrame df = DataFrame.FromArrowRecordBatch(originalBatch);
Expand Down
40 changes: 39 additions & 1 deletion test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

Expand Down Expand Up @@ -388,6 +388,44 @@ public void ClearColumnsTests()
Assert.Equal(0, dataFrame.Columns.LongCount());
}

[Fact]
public void RenameColumnWithSetNameTests()
{
    // Arrange: a two-column frame whose first column is named "City".
    var city = new StringDataFrameColumn("City", new string[] { "London", "Berlin" });
    var temp = new PrimitiveDataFrameColumn<int>("Temperature", new int[] { 12, 13 });
    var dataframe = new DataFrame(city, temp);

    // Act: rename through the column object itself.
    dataframe["City"].SetName("Town");
    DataFrameColumn renamedColumn = dataframe["Town"];

    // Assert: the old name no longer resolves, and the new name yields the very same column instance.
    Assert.Throws<ArgumentException>(() => dataframe["City"]);
    Assert.NotNull(renamedColumn);
    Assert.Equal("Town", renamedColumn.Name);
    Assert.Same(city, renamedColumn);
}

[Fact]
public void RenameColumnWithRenameColumnTests()
{
    // Arrange: a two-column frame whose first column is named "City".
    var city = new StringDataFrameColumn("City", new string[] { "London", "Berlin" });
    var temp = new PrimitiveDataFrameColumn<int>("Temperature", new int[] { 12, 13 });
    var dataframe = new DataFrame(city, temp);

    // Act: rename through the owning column collection.
    dataframe.Columns.RenameColumn("City", "Town");
    DataFrameColumn renamedColumn = dataframe["Town"];

    // Assert: the old name no longer resolves, and the new name yields the very same column instance.
    Assert.Throws<ArgumentException>(() => dataframe["City"]);
    Assert.NotNull(renamedColumn);
    Assert.Equal("Town", renamedColumn.Name);
    Assert.Same(city, renamedColumn);
}

[Fact]
public void TestBinaryOperations()
{
Expand Down
Loading