Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extension for DataFrame + Jupyter notebooks that adds properties that return concrete columns #25

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 192 additions & 15 deletions src/Microsoft.Data.Analysis.Interactive/DataFrameKernelExtension.cs
Original file line number Diff line number Diff line change
@@ -1,47 +1,224 @@
// Copyright (c) .NET Foundation and contributors. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.CommandLine;
using System.CommandLine.Invocation;
using System.IO;
using System.Text;
using System.Threading.Tasks;
using Microsoft.DotNet.Interactive;
using Microsoft.DotNet.Interactive.Commands;
using Microsoft.DotNet.Interactive.CSharp;
using Microsoft.DotNet.Interactive.Events;
using Microsoft.DotNet.Interactive.Formatting;

namespace Microsoft.Data.Analysis.Interactive
{
public class DataFrameKernelExtension : IKernelExtension
{
public Task OnLoadAsync(IKernel kernel)
bool _generateCsvMethod = false;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be better to flow this bool down through the method calls as a parameter, instead of setting a field here? What happens if multiple threads are calling this object at the same time?


public string GetFriendlyName(Type type)
{
//Formatter<DataFrame>.Register((tree, writer) =>
//{
// writer.Write("");
//}, "text/html");
string friendlyName = type.Name;
if (type.IsArray)
{
// Not handled yet
return "DataFrameColumn";
}
if (type.IsGenericType)
{
int backTick = friendlyName.IndexOf('`');
if (backTick > 0)
{
friendlyName = friendlyName.Remove(backTick);
}
friendlyName += "<";
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It feels like there should be an existing utility function somewhere that does this: Given a Type, write out its language specific name.

Note that as written, this will only work for C#.

Type[] typeParameters = type.GetGenericArguments();
for (int i = 0; i < typeParameters.Length; ++i)
{
string typeParamName = GetFriendlyName(typeParameters[i]);
friendlyName += (i == 0 ? typeParamName : "," + typeParamName);
}
friendlyName += ">";
}

return friendlyName;
}

public StringBuilder GetTypedDataFrameWithProperties(DataFrame dataFrame, string resultTypeName, out StringBuilder prettyFormatter)
{
prettyFormatter = new StringBuilder();
StringBuilder stringBuilder = new StringBuilder();
string constructor = @$"
public class {resultTypeName} : DataFrame
{{
public {resultTypeName}(DataFrame dataFrame)
{{
foreach (var column in dataFrame.Columns)
{{
Columns.Add(column);
}}
}}

";
stringBuilder.Append(constructor);
prettyFormatter.Append(constructor);

foreach (var column in dataFrame.Columns)
{
string columnName = column.Name.Replace(" ", string.Empty);
Type dataType = column.DataType;
string typeName = GetFriendlyName(column.GetType());
stringBuilder.Append($@"
public {typeName} {columnName}
{{
get
{{
int columnIndex = Columns.IndexOf(""{columnName}"");
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why isn't this just return Columns[""{columnName}""];?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because Columns[columnName] doesn't exist on the DataFrame side in 0.2.0.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then why not return this[""{columnName}""];?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because return this["columnName"] will go away in the next preview in favor of Columns[columnName].

return Columns[columnIndex] as {typeName};
}}
}}");
stringBuilder.AppendLine();
prettyFormatter.Append($@"
public {typeName} {columnName} {{ get; }}");
prettyFormatter.AppendLine();
}

stringBuilder.AppendLine();
stringBuilder.Append(@"}");
prettyFormatter.AppendLine();
prettyFormatter.Append(@"}");
prettyFormatter.AppendLine();
if (_generateCsvMethod)
{
AddLoadCsvToTypedDataFrame(stringBuilder, resultTypeName, prettyFormatter);
}
return stringBuilder;
}

/// <summary>
/// Emits the source text of a static <c>LoadCsv</c> method that returns the generated type.
/// </summary>
/// <param name="stringBuilder">Receives the method signature immediately followed by its body.</param>
/// <param name="resultTypeName">Name of the generated type; used as the return type and inside the body.</param>
/// <param name="prettyFormatter">Receives only the method signature, followed by a line terminator.</param>
public void AddLoadCsvToTypedDataFrame(StringBuilder stringBuilder, string resultTypeName, StringBuilder prettyFormatter)
{
    // The signature text is shared by both outputs; only the compilable output gets a body.
    string signature = $@"
public static new {resultTypeName} LoadCsv(string filename,
char separator = ',', bool header = true,
string[] columnNames = null, Type[] dataTypes = null,
int numRows = -1, int guessRows = 10,
bool addIndexColumn = false)";

    // Body forwards every argument to DataFrame.LoadCsv and wraps the result in the generated type.
    string body = $@"
{{
DataFrame df = DataFrame.LoadCsv(filename: filename, separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes, numRows: numRows,
guessRows: guessRows, addIndexColumn: addIndexColumn);
{resultTypeName} ret = new {resultTypeName}(df);
return ret;
}}";

    stringBuilder.Append(signature).Append(body);
    prettyFormatter.Append(signature).AppendLine();
}

/// <summary>
/// Loads a CSV file into a DataFrame, generates a typed DataFrame class named after the file,
/// displays the generated declaration and submits the generated code to the handling kernel.
/// </summary>
/// <param name="csv">The CSV file to read. The generated type name is the part of the file name before the first '.', with spaces removed.</param>
/// <param name="context">Invocation context used to display output, publish events and reach the handling kernel.</param>
/// <exception cref="ArgumentNullException"><paramref name="csv"/> is null.</exception>
public async Task HandleCsvAsync(FileInfo csv, KernelInvocationContext context)
{
    if (csv == null)
    {
        throw new ArgumentNullException(nameof(csv));
    }

    // Infer the type and generated name from fileName
    string fileName = csv.Name.Split('.')[0]; //Something like housing.A.B.csv would return housing
    string typeName = fileName.Replace(" ", "");
    StringBuilder strBuilder;
    StringBuilder prettyFormatter;

    // The flag makes the generator also emit a LoadCsv method. Reset it in a finally block
    // so a failure while reading the file cannot leave it switched on for later calls.
    _generateCsvMethod = true;
    try
    {
        // Only read access is needed; OpenRead also works for read-only files,
        // unlike Open(FileMode.Open) which requests read/write access.
        using (var stream = csv.OpenRead())
        {
            DataFrame df = DataFrame.LoadCsv(stream);
            strBuilder = GenerateTypedDataFrame(df, typeName, context, out prettyFormatter);
        }
    }
    finally
    {
        _generateCsvMethod = false;
    }

    // Create a new DataFrame var called dataFrameName
    string dataFrameName = typeName + "DataFrame";
    strBuilder.AppendLine();
    string buildNamedDataFrame = $@"
DataFrame _df = DataFrame.LoadCsv(filename: @""{csv.FullName}"");
{typeName} {dataFrameName} = new {typeName}(_df);
";
    strBuilder.Append(buildNamedDataFrame);
    prettyFormatter.AppendLine();
    prettyFormatter.Append(buildNamedDataFrame);

    // Show the readable declaration first, then announce the variable, then compile the real code.
    await context.DisplayAsync(prettyFormatter.ToString());
    context.Publish(new DisplayedValueProduced($"Created {typeName} {dataFrameName}: ", context.Command));

    await SubmitCodeToKernel(strBuilder, context);
}

/// <summary>
/// Produces the source code of a typed DataFrame class for <paramref name="df"/>
/// by forwarding to <c>GetTypedDataFrameWithProperties</c>.
/// </summary>
/// <param name="df">The DataFrame whose columns drive the generated properties.</param>
/// <param name="typeName">Name of the class to generate.</param>
/// <param name="context">Invocation context; not used by this method.</param>
/// <param name="prettyFormatter">Receives the display-oriented version of the generated code.</param>
/// <returns>The builder holding the generated code.</returns>
public StringBuilder GenerateTypedDataFrame(DataFrame df, string typeName, KernelInvocationContext context, out StringBuilder prettyFormatter)
    => GetTypedDataFrameWithProperties(df, typeName, out prettyFormatter);

/// <summary>
/// Wraps the accumulated source text in a <c>SubmitCode</c> command and sends it to the handling kernel.
/// </summary>
/// <param name="code">Builder holding the source text to submit.</param>
/// <param name="context">Invocation context that exposes the handling kernel.</param>
private async Task SubmitCodeToKernel(StringBuilder code, KernelInvocationContext context)
{
    string source = code.ToString();
    var submission = new SubmitCode(source);
    var kernel = context.HandlingKernel;
    await kernel.SendAsync(submission);
}

/// <summary>
/// Generates a typed DataFrame class for DataFrame variables found in the C# kernel's script state,
/// displaying and submitting the generated code for each match.
/// </summary>
/// <param name="dataFrameName">Name of the script variable to match; when null, every DataFrame variable matches.</param>
/// <param name="typeName">Name of the class to generate.</param>
/// <param name="context">Invocation context used to reach the kernel and display output.</param>
public async Task HandleDataFrameAsync(string dataFrameName, string typeName, KernelInvocationContext context)
{
    // Only a C# kernel exposes the script variables inspected below.
    if (!(context.HandlingKernel is CSharpKernel cSharp))
    {
        return;
    }

    // The variable list is read once, before any generated code is submitted.
    foreach (var candidate in cSharp.ScriptState.Variables)
    {
        bool nameMatches = dataFrameName == null || candidate.Name == dataFrameName;
        if (!nameMatches || !(candidate.Value is DataFrame frame))
        {
            continue;
        }

        StringBuilder generated = GenerateTypedDataFrame(frame, typeName, context, out StringBuilder display);
        await context.DisplayAsync(display.ToString());
        await SubmitCodeToKernel(generated, context);
    }
}

public Task OnLoadAsync(IKernel kernel)
{
var kernelBase = kernel as KernelBase;
var directive = new Command("#!doit")
var directive = new Command("#!generatedataframe")
{
Handler = CommandHandler.Create(async (FileInfo csv, string typeName, KernelInvocationContext context) =>
Handler = CommandHandler.Create(async (FileInfo csv, string dataFrameName, string typeName, KernelInvocationContext context) =>
{
// do the job
var command = new SubmitCode(@$"public class {typeName}{{}}");
context.Publish(new DisplayedValueProduced($"emitting {typeName} from {csv.FullName}", context.Command));
await context.HandlingKernel.SendAsync(command);
try
{
if (csv != null)
{
HandleCsvAsync(csv, context);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These HandleXXXAsync calls need to be await'd.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why doesn't HandleCsvAsync get the --type-name and --dataframe-name parameters passed to it?

}
else
{
HandleDataFrameAsync(dataFrameName, typeName, context);
}
}
catch (Exception)
{
await context.DisplayAsync($"Encountered an exception. Could not create type { (csv != null ? csv.Name : typeName)}");
}
})
};

directive.AddOption(new Option<FileInfo>(
"csv").ExistingOnly());
"--csv", "Read in a csv file into a DataFrame with strong properties. Also emits the generated DataFrame type").ExistingOnly());

directive.AddOption(new Option<string>(
"typeName",
getDefaultValue:() => "Foo"));
"--type-name",
getDefaultValue: () => "InteractiveDataFrame",
"The name of the generated DataFrame type. Defaults to InteractiveDataFrame"));

directive.AddOption(new Option<string>(
"--dataframe-name",
"The DataFrame variable to generate type information for"));
kernelBase.AddDirective(directive);

return Task.CompletedTask;

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
<PackageReference Include="Microsoft.Data.Analysis" Version="0.2.0" />
<PackageReference Include="Microsoft.DotNet.Interactive" Version="1.0.0-beta.20074.3" />
<PackageReference Include="Microsoft.DotNet.Interactive.Formatting" Version="1.0.0-beta.20074.3" />
<PackageReference Include="Microsoft.DotNet.Interactive.CSharp" Version="1.0.0-beta.20074.3" />
</ItemGroup>

</Project>