Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IDataView to DataFrame #5712

Merged
merged 9 commits into from
Mar 22, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 3 additions & 77 deletions src/Microsoft.Data.Analysis/DataFrame.IDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,17 @@

using System;
using System.Collections.Generic;
using System.Diagnostics;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Microsoft.Data.Analysis
{
public partial class DataFrame : IDataView
{
{
// Explicit IDataView.CanShuffle implementation: this DataFrame always reports that it cannot shuffle rows.
// TODO: support shuffling
bool IDataView.CanShuffle => false;

private DataViewSchema _schema;
private DataViewSchema DataViewSchema
internal DataViewSchema DataViewSchema
{
get
{
Expand Down Expand Up @@ -53,6 +51,7 @@ private DataViewRowCursor GetRowCursorCore(IEnumerable<DataViewSchema.Column> co

return new RowCursor(this, activeColumns);
}

DataViewRowCursor IDataView.GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand)
{
return GetRowCursorCore(columnsNeeded);
Expand All @@ -63,78 +62,5 @@ DataViewRowCursor[] IDataView.GetRowCursorSet(IEnumerable<DataViewSchema.Column>
// TODO: change to support parallel cursors
return new DataViewRowCursor[] { GetRowCursorCore(columnsNeeded) };
}

/// <summary>
/// Forward-only cursor over the rows of a <see cref="DataFrame"/>. One getter delegate is created
/// up front for every active column and stored at that column's schema index; inactive columns
/// keep a null slot.
/// </summary>
private sealed class RowCursor : DataViewRowCursor
{
    private readonly DataFrame _dataFrame;
    private readonly Delegate[] _getters;
    private long _position;
    private bool _disposed;

    /// <summary>
    /// Creates a cursor positioned before the first row (position -1).
    /// </summary>
    /// <param name="dataFrame">The frame whose rows are enumerated.</param>
    /// <param name="activeColumns">Per-column flags, indexed by schema position; true means a getter is built.</param>
    public RowCursor(DataFrame dataFrame, bool[] activeColumns)
    {
        Debug.Assert(dataFrame != null);
        Debug.Assert(activeColumns != null);

        _position = -1;
        _dataFrame = dataFrame;
        _getters = new Delegate[Schema.Count];

        int slot = 0;
        while (slot < _getters.Length)
        {
            if (activeColumns[slot])
            {
                _getters[slot] = CreateGetterDelegate(slot);
                Debug.Assert(_getters[slot] != null);
            }

            slot++;
        }
    }

    /// <summary>Zero-based index of the current row; -1 before the first MoveNext and after disposal.</summary>
    public override long Position
    {
        get { return _position; }
    }

    /// <summary>Always 0 for this cursor.</summary>
    public override long Batch
    {
        get { return 0; }
    }

    /// <summary>The schema of the underlying data frame.</summary>
    public override DataViewSchema Schema
    {
        get { return _dataFrame.DataViewSchema; }
    }

    /// <summary>
    /// Marks the cursor as disposed (once) and, when disposing, resets the position to -1.
    /// </summary>
    protected override void Dispose(bool disposing)
    {
        if (!_disposed)
        {
            if (disposing)
            {
                _position = -1;
            }

            _disposed = true;
            base.Dispose(disposing);
        }
    }

    // Asks the data frame column at the given index for a getter bound to this cursor.
    private Delegate CreateGetterDelegate(int col)
    {
        return _dataFrame.Columns[col].GetDataViewGetter(this);
    }

    /// <summary>
    /// Returns the cached getter for <paramref name="column"/>, cast to the requested value type.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The column is not active on this cursor.</exception>
    public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column column)
    {
        if (IsColumnActive(column))
        {
            return (ValueGetter<TValue>)_getters[column.Index];
        }

        throw new ArgumentOutOfRangeException(nameof(column));
    }

    /// <summary>
    /// Returns a getter that produces a row id built from the current position.
    /// </summary>
    public override ValueGetter<DataViewRowId> GetIdGetter()
    {
        return (ref DataViewRowId value) =>
        {
            value = new DataViewRowId((ulong)_position, 0);
        };
    }

    /// <summary>
    /// A column is active when a getter was created for its schema index.
    /// </summary>
    public override bool IsColumnActive(DataViewSchema.Column column)
    {
        Delegate cached = _getters[column.Index];
        return cached != null;
    }

    /// <summary>
    /// Advances to the next row; returns false once disposed or past the last row.
    /// </summary>
    public override bool MoveNext()
    {
        // The increment only happens when the cursor has not been disposed.
        return !_disposed && ++_position < _dataFrame.Rows.Count;
    }
}
}
}
15 changes: 15 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,21 @@ public virtual DataFrameColumn Sort(bool ascending = true)
/// </param>
protected internal virtual void AddDataViewColumn(DataViewSchema.Builder builder) => throw new NotImplementedException();

/// <summary>
/// Appends a value to this <see cref="DataFrameColumn"/> using <paramref name="cursor"/>.
/// The base implementation always throws; column types that support this operation override it.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
/// <param name="ValueGetter">The cached ValueGetter for this column.</param>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate ValueGetter) => throw new NotImplementedException();
pgovind marked this conversation as resolved.
Show resolved Hide resolved

/// <summary>
/// Returns the ValueGetter for <paramref name="schemaColumn"/> in <paramref name="cursor"/> as a delegate to be cached.
/// The base implementation always throws; column types that support this operation override it.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
/// <returns>The getter, typed as a plain <see cref="Delegate"/> so callers can cache it.</returns>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException();

/// <summary>
/// Clamps values beyond the specified thresholds
/// </summary>
Expand Down
125 changes: 125 additions & 0 deletions src/Microsoft.Data.Analysis/IDataView.Extension.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Data.Analysis;
using Microsoft.ML.Data;

namespace Microsoft.ML
{
/// <summary>
/// Extension methods that copy the contents of an <see cref="IDataView"/> into a <see cref="DataFrame"/>.
/// </summary>
public static class IDataViewExtensions
{
    private const int defaultMaxRows = 100;

    /// <summary>
    /// Copies every non-hidden column of <paramref name="dataView"/> into a new <see cref="DataFrame"/>.
    /// </summary>
    /// <param name="dataView">The data view to read.</param>
    /// <param name="maxRows">Rows are copied while the cursor position is below this value. Defaults to 100.</param>
    /// <returns>A new <see cref="DataFrame"/> holding the copied rows.</returns>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = defaultMaxRows)
    {
        return ToDataFrame(dataView, maxRows, null);
    }

    /// <summary>
    /// Copies the named columns of <paramref name="dataView"/> into a new <see cref="DataFrame"/>,
    /// using the default row limit of 100.
    /// </summary>
    /// <param name="dataView">The data view to read.</param>
    /// <param name="selectColumns">Names of the columns to copy; null or empty selects every non-hidden column.</param>
    /// <returns>A new <see cref="DataFrame"/> holding the copied rows.</returns>
    public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns)
    {
        return ToDataFrame(dataView, defaultMaxRows, selectColumns);
    }

    /// <summary>
    /// Copies the named columns of <paramref name="dataView"/> into a new <see cref="DataFrame"/>.
    /// </summary>
    /// <param name="dataView">The data view to read.</param>
    /// <param name="maxRows">Rows are copied while the cursor position is below this value.</param>
    /// <param name="selectColumns">Names of the columns to copy; null or empty selects every non-hidden column.</param>
    /// <returns>A new <see cref="DataFrame"/> holding the copied rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    /// <exception cref="NotSupportedException">A selected column has a type with no matching DataFrame column.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns)
    {
        if (dataView == null)
        {
            throw new ArgumentNullException(nameof(dataView));
        }

        DataViewSchema schema = dataView.Schema;
        List<DataFrameColumn> columns = new List<DataFrameColumn>(schema.Count);
        List<DataViewSchema.Column> activeColumns = new List<DataViewSchema.Column>(schema.Count);

        HashSet<string> selectColumnsSet = null;
        if (selectColumns != null && selectColumns.Length > 0)
        {
            selectColumnsSet = new HashSet<string>(selectColumns);
        }

        foreach (DataViewSchema.Column column in schema)
        {
            if (column.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(column.Name)))
            {
                continue;
            }

            // columns[i] and activeColumns[i] always describe the same schema column.
            columns.Add(CreateColumn(column));
            activeColumns.Add(column);
        }

        // The cursor is disposable, so release it even if reading a row throws.
        using (DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns))
        {
            // Resolve each getter once and reuse it for every row.
            Delegate[] activeColumnDelegates = new Delegate[columns.Count];
            for (int i = 0; i < columns.Count; i++)
            {
                activeColumnDelegates[i] = columns[i].GetValueGetterUsingCursor(cursor, activeColumns[i]);
            }

            while (cursor.MoveNext() && cursor.Position < maxRows)
            {
                for (int i = 0; i < columns.Count; i++)
                {
                    columns[i].AddValueUsingCursor(cursor, activeColumns[i], activeColumnDelegates[i]);
                }
            }
        }

        return new DataFrame(columns);
    }

    // Creates the empty DataFrame column whose element type matches the data view column's type.
    private static DataFrameColumn CreateColumn(DataViewSchema.Column column)
    {
        DataViewType type = column.Type;
        string name = column.Name;

        if (type == BooleanDataViewType.Instance)
        {
            return new BooleanDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Byte)
        {
            return new ByteDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Double)
        {
            return new DoubleDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Single)
        {
            return new SingleDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Int32)
        {
            return new Int32DataFrameColumn(name);
        }
        if (type == NumberDataViewType.Int64)
        {
            return new Int64DataFrameColumn(name);
        }
        if (type == NumberDataViewType.SByte)
        {
            return new SByteDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Int16)
        {
            return new Int16DataFrameColumn(name);
        }
        if (type == NumberDataViewType.UInt32)
        {
            return new UInt32DataFrameColumn(name);
        }
        if (type == NumberDataViewType.UInt64)
        {
            return new UInt64DataFrameColumn(name);
        }
        if (type == NumberDataViewType.UInt16)
        {
            return new UInt16DataFrameColumn(name);
        }
        if (type == TextDataViewType.Instance)
        {
            return new StringDataFrameColumn(name);
        }

        // NOTE(review): vector types in IDataView end up here; DataFrame would need
        // vector column support before they can be converted.
        throw new NotSupportedException(String.Format(Microsoft.Data.Strings.NotSupportedColumnType, type.RawType.Name));
    }
}

}
26 changes: 26 additions & 0 deletions src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -775,5 +775,31 @@ private static ValueGetter<ushort> CreateCharValueGetterDelegate(DataViewRowCurs

// Builds a getter that reads the decimal cell at the cursor's current row and reports it as a
// double, substituting NaN when the cell holds no value.
private static ValueGetter<double> CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn<decimal> column)
{
    return (ref double value) =>
    {
        double? converted = (double?)column[cursor.Position];
        value = converted ?? double.NaN;
    };
}

/// <summary>
/// Reads one value through <paramref name="getter"/> and stores it in this column at the row
/// index given by the cursor's current position: an existing row is overwritten, and the row
/// immediately past the end is appended.
/// </summary>
/// <param name="cursor">The row cursor which has the current position.</param>
/// <param name="column">The schema column being read (not used by this implementation).</param>
/// <param name="getter">The cached getter for this column; must be a <see cref="ValueGetter{T}"/>.</param>
/// <exception cref="InvalidCastException"><paramref name="getter"/> is not a <see cref="ValueGetter{T}"/>.</exception>
/// <exception cref="IndexOutOfRangeException">The cursor position is more than one row past the end of this column.</exception>
internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column, Delegate getter)
{
    Debug.Assert(getter != null, "Expected getter to be valid");

    long row = cursor.Position;

    // Direct cast rather than `as`: a delegate of the wrong type fails with a descriptive
    // InvalidCastException instead of a NullReferenceException at the invocation.
    ValueGetter<T> typedGetter = (ValueGetter<T>)getter;
    T value = default;
    typedGetter(ref value);

    if (Length > row)
    {
        this[row] = value;
    }
    else if (Length == row)
    {
        Append(value);
    }
    else
    {
        throw new IndexOutOfRangeException(nameof(row));
    }
}

// Asks the cursor for the getter of schemaColumn typed to this column's element type;
// it is handed back as a plain Delegate so the caller can cache it.
internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
    => cursor.GetGetter<T>(schemaColumn);
}
}
98 changes: 98 additions & 0 deletions src/Microsoft.Data.Analysis/RowCursor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Microsoft.Data.Analysis
{
/// <summary>
/// Forward-only cursor over the rows of a <see cref="DataFrame"/>. Getters are created only for
/// active columns and stored compactly; a dictionary maps each active column's schema index to
/// its slot in the getter list.
/// </summary>
internal sealed class RowCursor : DataViewRowCursor
{
    private bool _disposed;
    private long _position;
    private readonly DataFrame _dataFrame;
    internal readonly List<Delegate> _getters;
    private readonly Dictionary<int, int> _columnIndexToGetterIndex;

    /// <summary>
    /// Creates a cursor positioned before the first row (position -1).
    /// </summary>
    /// <param name="dataFrame">The frame whose rows are enumerated.</param>
    /// <param name="activeColumns">Per-column flags, indexed by schema position; true means a getter is built.</param>
    public RowCursor(DataFrame dataFrame, bool[] activeColumns)
    {
        Debug.Assert(dataFrame != null);
        Debug.Assert(activeColumns != null);

        _columnIndexToGetterIndex = new Dictionary<int, int>();
        _position = -1;
        _dataFrame = dataFrame;
        _getters = new List<Delegate>();
        for (int i = 0; i < Schema.Count; i++)
        {
            if (!activeColumns[i])
            {
                continue;
            }

            Delegate getter = CreateGetterDelegate(i);
            Debug.Assert(getter != null);
            _getters.Add(getter);
            _columnIndexToGetterIndex[i] = _getters.Count - 1;
        }
    }

    /// <summary>Zero-based index of the current row; -1 before the first MoveNext and after disposal.</summary>
    public override long Position => _position;

    /// <summary>Always 0 for this cursor.</summary>
    public override long Batch => 0;

    /// <summary>The schema of the underlying data frame.</summary>
    public override DataViewSchema Schema => _dataFrame.DataViewSchema;

    /// <summary>
    /// Marks the cursor as disposed (once) and, when disposing, resets the position to -1.
    /// </summary>
    protected override void Dispose(bool disposing)
    {
        if (_disposed)
        {
            return;
        }

        if (disposing)
        {
            _position = -1;
        }

        _disposed = true;
        base.Dispose(disposing);
    }

    // Asks the data frame column at the given index for a getter bound to this cursor.
    private Delegate CreateGetterDelegate(int col)
    {
        DataFrameColumn column = _dataFrame.Columns[col];
        return column.GetDataViewGetter(this);
    }

    /// <summary>
    /// Returns the cached getter for <paramref name="column"/>, cast to the requested value type.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The column is not active on this cursor.</exception>
    public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column column)
    {
        // TryGetValue instead of the dictionary indexer: an inactive column has no entry, and
        // indexing would throw KeyNotFoundException before the intended exception is reached.
        if (!_columnIndexToGetterIndex.TryGetValue(column.Index, out int getterIndex) || _getters[getterIndex] == null)
        {
            throw new ArgumentOutOfRangeException(nameof(column));
        }

        return (ValueGetter<TValue>)_getters[getterIndex];
    }

    /// <summary>
    /// Returns a getter that produces a row id built from the current position.
    /// </summary>
    public override ValueGetter<DataViewRowId> GetIdGetter()
    {
        return (ref DataViewRowId value) => value = new DataViewRowId((ulong)_position, 0);
    }

    /// <summary>
    /// A column is active when a getter was created for it; inactive columns return false
    /// rather than throwing.
    /// </summary>
    public override bool IsColumnActive(DataViewSchema.Column column)
    {
        return _columnIndexToGetterIndex.TryGetValue(column.Index, out int getterIndex)
            && _getters[getterIndex] != null;
    }

    /// <summary>
    /// Advances to the next row; returns false once disposed or past the last row.
    /// </summary>
    public override bool MoveNext()
    {
        if (_disposed)
        {
            return false;
        }

        _position++;
        return _position < _dataFrame.Rows.Count;
    }
}
}
Loading