diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml new file mode 100644 index 0000000..f8e5ff5 --- /dev/null +++ b/.github/workflows/macos-ci.yml @@ -0,0 +1,26 @@ +# This workflow will build a .NET project +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-net + +name: MacOS Build and Test +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: macos-latest + + steps: + - uses: actions/checkout@v4 + - name: Setup .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: 8.0.x + - name: Restore dependencies + run: dotnet restore + - name: Build + run: dotnet build --no-restore + #- name: Test + # run: dotnet test --no-build --verbosity normal \ No newline at end of file diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml new file mode 100644 index 0000000..1ee6be9 --- /dev/null +++ b/.github/workflows/ubuntu-ci.yml @@ -0,0 +1,26 @@ +# This workflow will build a .NET project +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-net + +name: Ubuntu Build and Test +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Setup .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: 8.0.x + - name: Restore dependencies + run: dotnet restore + - name: Build + run: dotnet build --no-restore + - name: Test + run: dotnet test --no-build --verbosity normal \ No newline at end of file diff --git a/.github/workflows/windows-ci.yml b/.github/workflows/windows-ci.yml new file mode 100644 index 0000000..149a70a --- /dev/null +++ b/.github/workflows/windows-ci.yml @@ -0,0 +1,26 @@ +# This workflow will build a .NET project +# For more information see: 
https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-net + +name: Windows Build and Test +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: windows-latest + + steps: + - uses: actions/checkout@v4 + - name: Setup .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: 8.0.x + - name: Restore dependencies + run: dotnet restore + - name: Build + run: dotnet build --no-restore + - name: Test + run: dotnet test --no-build --verbosity normal \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..6b6006e --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,26 @@ +{ + "version": "0.2.0", + "configurations": [ + { + // Use IntelliSense to find out which attributes exist for C# debugging + // Use hover for the description of the existing attributes + // For further information visit https://github.com/dotnet/vscode-csharp/blob/main/debugger-launchjson.md + "name": ".NET Core Launch (console)", + "type": "coreclr", + "request": "launch", + "preLaunchTask": "build", + // If you have changed target frameworks, make sure to update the program path. 
+ "program": "${workspaceFolder}/test/Pandas.NET.Test/bin/Debug/net8.0/Pandas.NET.Test.dll", + "args": [], + "cwd": "${workspaceFolder}/test/Pandas.NET.Test", + // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console + "console": "internalConsole", + "stopAtEntry": false + }, + { + "name": ".NET Core Attach", + "type": "coreclr", + "request": "attach" + } + ] +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..5fd1dbd --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,41 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "build", + "command": "dotnet", + "type": "process", + "args": [ + "build", + "${workspaceFolder}/Pandas.NET.sln", + "/property:GenerateFullPaths=true", + "/consoleloggerparameters:NoSummary;ForceNoAlign" + ], + "problemMatcher": "$msCompile" + }, + { + "label": "publish", + "command": "dotnet", + "type": "process", + "args": [ + "publish", + "${workspaceFolder}/Pandas.NET.sln", + "/property:GenerateFullPaths=true", + "/consoleloggerparameters:NoSummary;ForceNoAlign" + ], + "problemMatcher": "$msCompile" + }, + { + "label": "watch", + "command": "dotnet", + "type": "process", + "args": [ + "watch", + "run", + "--project", + "${workspaceFolder}/Pandas.NET.sln" + ], + "problemMatcher": "$msCompile" + } + ] +} \ No newline at end of file diff --git a/Pandas.NET.sln b/Pandas.NET.sln index 733a792..5290433 100644 --- a/Pandas.NET.sln +++ b/Pandas.NET.sln @@ -1,41 +1,50 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.28010.2050 +# Visual Studio Version 17 +VisualStudioVersion = 17.4.33103.184 MinimumVisualStudioVersion = 10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Pandas.NET.Test", "test\Pandas.NET.Test\Pandas.NET.Test.csproj", "{D075CE95-F14C-47D6-A64A-73DFDCF3E80A}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "libs", "libs", 
"{397A50A6-456B-4EF2-AABC-03A4A8CE5BCE}" -EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Pandas.NET", "src\Pandas.NET\Pandas.NET.csproj", "{41658850-1B21-4409-BC35-D0E8A7B73661}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NumSharp.Core", "src\NumSharp.Core\src\NumSharp.Core\NumSharp.Core.csproj", "{30301B04-C061-4B63-A33C-5BA66D8501EE}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Pandas.Console", "test\PandasConsole\PandasConsole.csproj", "{16298C82-7219-4F52-ACD1-E0AFE650F207}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {D075CE95-F14C-47D6-A64A-73DFDCF3E80A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {D075CE95-F14C-47D6-A64A-73DFDCF3E80A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D075CE95-F14C-47D6-A64A-73DFDCF3E80A}.Debug|x64.ActiveCfg = Debug|Any CPU + {D075CE95-F14C-47D6-A64A-73DFDCF3E80A}.Debug|x64.Build.0 = Debug|Any CPU {D075CE95-F14C-47D6-A64A-73DFDCF3E80A}.Release|Any CPU.ActiveCfg = Release|Any CPU {D075CE95-F14C-47D6-A64A-73DFDCF3E80A}.Release|Any CPU.Build.0 = Release|Any CPU + {D075CE95-F14C-47D6-A64A-73DFDCF3E80A}.Release|x64.ActiveCfg = Release|Any CPU + {D075CE95-F14C-47D6-A64A-73DFDCF3E80A}.Release|x64.Build.0 = Release|Any CPU {41658850-1B21-4409-BC35-D0E8A7B73661}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {41658850-1B21-4409-BC35-D0E8A7B73661}.Debug|Any CPU.Build.0 = Debug|Any CPU + {41658850-1B21-4409-BC35-D0E8A7B73661}.Debug|x64.ActiveCfg = Debug|Any CPU + {41658850-1B21-4409-BC35-D0E8A7B73661}.Debug|x64.Build.0 = Debug|Any CPU {41658850-1B21-4409-BC35-D0E8A7B73661}.Release|Any CPU.ActiveCfg = Release|Any CPU {41658850-1B21-4409-BC35-D0E8A7B73661}.Release|Any CPU.Build.0 = Release|Any CPU - {30301B04-C061-4B63-A33C-5BA66D8501EE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - 
{30301B04-C061-4B63-A33C-5BA66D8501EE}.Debug|Any CPU.Build.0 = Debug|Any CPU - {30301B04-C061-4B63-A33C-5BA66D8501EE}.Release|Any CPU.ActiveCfg = Release|Any CPU - {30301B04-C061-4B63-A33C-5BA66D8501EE}.Release|Any CPU.Build.0 = Release|Any CPU + {41658850-1B21-4409-BC35-D0E8A7B73661}.Release|x64.ActiveCfg = Release|Any CPU + {41658850-1B21-4409-BC35-D0E8A7B73661}.Release|x64.Build.0 = Release|Any CPU + {16298C82-7219-4F52-ACD1-E0AFE650F207}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {16298C82-7219-4F52-ACD1-E0AFE650F207}.Debug|Any CPU.Build.0 = Debug|Any CPU + {16298C82-7219-4F52-ACD1-E0AFE650F207}.Debug|x64.ActiveCfg = Debug|Any CPU + {16298C82-7219-4F52-ACD1-E0AFE650F207}.Debug|x64.Build.0 = Debug|Any CPU + {16298C82-7219-4F52-ACD1-E0AFE650F207}.Release|Any CPU.ActiveCfg = Release|Any CPU + {16298C82-7219-4F52-ACD1-E0AFE650F207}.Release|Any CPU.Build.0 = Release|Any CPU + {16298C82-7219-4F52-ACD1-E0AFE650F207}.Release|x64.ActiveCfg = Release|Any CPU + {16298C82-7219-4F52-ACD1-E0AFE650F207}.Release|x64.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection - GlobalSection(NestedProjects) = preSolution - {30301B04-C061-4B63-A33C-5BA66D8501EE} = {397A50A6-456B-4EF2-AABC-03A4A8CE5BCE} - EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {D6326D52-F6B8-4EDC-AF3D-36766F57C9E9} EndGlobalSection diff --git a/README.cn.md b/README.cn.md new file mode 100644 index 0000000..4ca2e16 --- /dev/null +++ b/README.cn.md @@ -0,0 +1,57 @@ +# Pandas.NET + +[![Join the chat at https://gitter.im/publiclab/publiclab](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/sci-sharp/community) +[![NuGet](https://img.shields.io/nuget/dt/Pandas.NET.svg)](https://www.nuget.org/packages/Pandas.NET) +[![Build 
Status](https://dev.azure.com/scisharp/Pandas.NET/_apis/build/status/Windows%20CI?branchName=master&label=Windows)](https://dev.azure.com/scisharp/Pandas.NET/_build/latest?definitionId=2&branchName=master) +[![Build Status](https://dev.azure.com/scisharp/Pandas.NET/_apis/build/status/Ubuntu%20CI?branchName=master&label=Ubuntu)](https://dev.azure.com/scisharp/Pandas.NET/_build/latest?definitionId=3&branchName=master) +[![Build Status](https://dev.azure.com/scisharp/Pandas.NET/_apis/build/status/macOS%20CI?branchName=master&label=MacOS)](https://dev.azure.com/scisharp/Pandas.NET/_build/latest?definitionId=1&branchName=master) + +## Implemented APIs + +### 1. Pandas + +- DataFrame + - `pd.DataFrame(NDArray data, IList index, IList columns, Type dtype)` + - `pd.DataFrame(NDArray data, IList index, IList columns, Type dtype)` + - `pd.DataFrame(IDictionary data, IList index)` + - `pd.DataFrame(IDictionary data, IList index)` +- Series + - `pd.Series(NDArray data)` + - `pd.Series(Array data)` + - `pd.Series(T data)` + +### 2. Series + +- `s.iloc[0]`:按索引选取数据 + +- `s.loc["index_label"]`:按索引标签选取数据 + +### 3. 
DataFrame + +#### 结构 + +- df.Index +- df.Columns +- df.Values +- df.Shape +- df.NDIM +- df.Size + +#### 方法 + +- `df[0]`:按列索引选取数据(返回 Series) +- `df[params int[] columnIndexs] `:按列索引选取数据(返回 DataFrame) +- `df["column_label"]`:按列标签选取数据(返回 Series);可通过 `set` 访问器增加列(如果列标签不存在) +- `df[params string[] columnLabels]`:按列标签选取数据(返回 DataFrame) +- `df.Column(string columnLabel, NDArray value)`:设置列以及列的值;当列不存在时创建 +- `df.Column(int columnIndex, NDArray value)`:设置列以及列的值;当列不存在时报异常 +- `df[Slice s]`:行切片选取数据 + +- `df.loc["index_label"]`:按行索引标签选取数据 +- `df.loc["index_label", "column_label"]`:按行和列标签选取数据 +- `df.iloc[0]`:按行索引(row number)选取数据 + +## 函数库 + +- [Math.NET Numerics](https://numerics.mathdotnet.com/) + - [MIT License](https://github.com/mathnet/mathnet-numerics/blob/master/LICENSE.md) diff --git a/README.md b/README.md index cd61f7d..aea6811 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,57 @@ # Pandas.NET - +[![Join the chat at https://gitter.im/publiclab/publiclab](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/sci-sharp/community) +[![NuGet](https://img.shields.io/nuget/dt/Pandas.NET.svg)](https://www.nuget.org/packages/Pandas.NET) +[![Build Status](https://dev.azure.com/scisharp/Pandas.NET/_apis/build/status/Windows%20CI?branchName=master&label=Windows)](https://dev.azure.com/scisharp/Pandas.NET/_build/latest?definitionId=2&branchName=master) +[![Build Status](https://dev.azure.com/scisharp/Pandas.NET/_apis/build/status/Ubuntu%20CI?branchName=master&label=Ubuntu)](https://dev.azure.com/scisharp/Pandas.NET/_build/latest?definitionId=3&branchName=master) +[![Build Status](https://dev.azure.com/scisharp/Pandas.NET/_apis/build/status/macOS%20CI?branchName=master&label=MacOS)](https://dev.azure.com/scisharp/Pandas.NET/_build/latest?definitionId=1&branchName=master) ## Implemented APIs ### 1. 
Pandas -* DataFrame - * `pd.DataFrame(NDArray data, IList index, IList columns, Type dtype)` - * `pd.DataFrame(NDArray data, IList index, IList columns, Type dtype)` - * `pd.DataFrame(IDictionary data, IList index)` - * `pd.DataFrame(IDictionary data, IList index)` -* Series - * `pd.Series(NDArray data, IDataIndex index=null)` - * `pd.Series(T data, IDataIndex index=null)` +- DataFrame + - `pd.DataFrame(NDArray data, IList index, IList columns, Type dtype)` + - `pd.DataFrame(NDArray data, IList index, IList columns, Type dtype)` + - `pd.DataFrame(IDictionary data, IList index)` + - `pd.DataFrame(IDictionary data, IList index)` +- Series + - `pd.Series(NDArray data)` + - `pd.Series(Array data)` + - `pd.Series(T data)` -### 2. Series +### 2. Series -* `s.iloc[0]`:按索引选取数据 +- `s.iloc[0]`: Select data by position -* `s.loc["index_label"]`:按索引标签选取数据 +- `s.loc["index_label"]`: Select data by index label ### 3. DataFrame -#### 结构 +#### Structure + +- df.Index +- df.Columns +- df.Values +- df.Shape +- df.NDIM +- df.Size -* df.Index -* df.Columns -* df.Values -* df.Shape -* df.NDIM -* df.Size +#### Methods -#### 方法 +- `df[0]`: Select data by column index (returns Series) +- `df[params int[] columnIndexs] `: Select data by column index (returns DataFrame) +- `df["column_label"]`: Select data by column label (returns Series); the `set` accessor adds the column if the label does not exist +- `df[params string[] columnLabels]`: Select data by column label (returns DataFrame) +- `df.Column(string columnLabel, NDArray value)`: Set the values of a column; the column is created if it does not exist +- `df.Column(int columnIndex, NDArray value)`: Set the values of a column; an exception is thrown if the column does not exist +- `df[Slice s]`: Select rows by slice -* `df[0]`:按列索引选取数据(返回Series) -* `df[params int[] columnIndexs] `:按列索引选取数据(返回DataFrame) -* `df["column_label"]`:按列标签选取数据(返回Series);可通过 `set` 访问器增加列(如果列标签不存在) -* `df[params string[] 
columnLabels]`:按列标签选取数据(返回DataFrame) -* `df.Column(string columnLabel, NDArray value)`:设置列以及列的值;当列不存在时创建 -* `df.Column(int columnIndex, NDArray value)`:设置列以及列的值;当列不存在时报异常 -* df[Slice s]:行切片选取数据 +- `df.loc["index_label"]`: Select data by row index label +- `df.loc["index_label", "column_label"]`: Select data by row and column labels +- `df.iloc[0]`: Index by row (row number) select data -* `df.loc["index_label"]`:按行索引标签选取数据 -* `df.loc["index_label", "column_label"]`:按行和列标签选取数据 -* `df.iloc[0]`:按行索引(row number)选取数据 +## Libraries +- [Math.NET Numerics](https://numerics.mathdotnet.com/) + - [MIT License](https://github.com/mathnet/mathnet-numerics/blob/master/LICENSE.md) diff --git a/src/NumSharp.Core b/src/NumSharp.Core deleted file mode 160000 index ffe0da3..0000000 --- a/src/NumSharp.Core +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ffe0da3679f98eb8361ef73161fef1aa588d3ac7 diff --git a/src/Pandas.NET/Columns/Column.cs b/src/Pandas.NET/Columns/Column.cs new file mode 100644 index 0000000..32940a9 --- /dev/null +++ b/src/Pandas.NET/Columns/Column.cs @@ -0,0 +1,27 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace PandasNet +{ + public class Column + { + public int Index { get; set; } + public string Name { get; set; } + public Type DType { get; set; } + + + public Column() + { + } + + public Column(string name, Type dtype) + { + Name = name; + DType = dtype; + } + + public override string ToString() + => $"{Name} {DType}"; + } +} diff --git a/src/Pandas.NET/DataFrameApi.cs b/src/Pandas.NET/DataFrameApi.cs new file mode 100644 index 0000000..5adc49f --- /dev/null +++ b/src/Pandas.NET/DataFrameApi.cs @@ -0,0 +1,57 @@ +using Newtonsoft.Json; +using Newtonsoft.Json.Linq; +using System; +using System.Collections.Generic; +using System.Text; +using System.Linq; + +namespace PandasNet +{ + public class DataFrameApi + { + public DataFrame from_dict(string data) + { + // data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + var 
json = (JObject)JsonConvert.DeserializeObject(data); + + var cols = new List(); + foreach (var col in json) + { + var column = new Column + { + Name = col.Key + }; + + var type = col.Value.First().Type; + if (type == JTokenType.Integer) + { + var array = GetArray(col.Value); + column.DType = typeof(int); + var series = new Series(array, column); + cols.Add(series); + } + else if (type == JTokenType.String) + { + var array = GetArray(col.Value); + column.DType = typeof(string); + var series = new Series(array, column); + cols.Add(series); + }; + } + var df = new DataFrame(cols); + foreach (var s in df.data) + { + s.SetIndex(df.index); + } + return df; + } + + T[] GetArray(JToken values) + { + var array = new List(); + foreach (var row in values) + array.Add(row.Value()); + return array.ToArray(); + } + } +} diff --git a/src/Pandas.NET/DataFrames/DataFrame.Implicit.cs b/src/Pandas.NET/DataFrames/DataFrame.Implicit.cs new file mode 100644 index 0000000..3a91156 --- /dev/null +++ b/src/Pandas.NET/DataFrames/DataFrame.Implicit.cs @@ -0,0 +1,9 @@ +using Tensorflow.NumPy; + +namespace PandasNet; + +public partial class DataFrame +{ + public static implicit operator NDArray(DataFrame df) + => df.to_numpy(); +} diff --git a/src/Pandas.NET/DataFrames/DataFrame.Index.cs b/src/Pandas.NET/DataFrames/DataFrame.Index.cs new file mode 100644 index 0000000..4c685ce --- /dev/null +++ b/src/Pandas.NET/DataFrames/DataFrame.Index.cs @@ -0,0 +1,92 @@ +using System; +using System.Collections.Generic; +using System.Data.Common; +using System.Linq; +using System.Text; +using Tensorflow; + +namespace PandasNet +{ + public partial class DataFrame + { + public DataFrame this[Slice slice] + { + get => Slice(slice); + } + + public object this[int row, string columName] + { + get + { + return _data.FirstOrDefault(x => x.name == columName).GetValue(row); + } + } + + public Series this[string columName] + { + get + { + return _data.FirstOrDefault(x => x.name == columName); + } + + set + { + 
_data.Remove(_data.FirstOrDefault(x => x.name == columName));
+                _columns.Remove(_columns.FirstOrDefault(x => x.Name == columName));
+
+                _data.Add(value);
+                _columns.Add(new Column
+                {
+                    Name = columName,
+                    DType = value.dtype
+                });
+            }
+        }
+
+        public DataFrame this[params string[] columNames]
+        {
+            get => new DataFrame(_data.Where(x => columNames.Contains(x.name)).ToList());
+        }
+
+        /// <summary>
+        /// Returns a new DataFrame holding the rows selected by <paramref name="slice"/>.
+        /// </summary>
+        /// <remarks>
+        /// Bounds follow Python slicing rules for a positive step: a missing start/stop means
+        /// "first row" / "one past the last row", negative bounds count from the end, and
+        /// out-of-range bounds are clamped to the frame instead of reading past it.
+        /// </remarks>
+        /// <param name="slice">Row range and step to copy.</param>
+        /// <returns>A frame with freshly allocated column series and index.</returns>
+        /// <exception cref="ArgumentException">The slice step is zero or negative.</exception>
+        DataFrame Slice(Slice slice)
+        {
+            var size = _index.size;
+            var step = slice.Step;
+            if (step <= 0)
+                throw new ArgumentException("Only positive slice steps are supported.", nameof(slice));
+
+            // Negative bounds count from the end; the result is then clamped into [0, size].
+            int Normalize(int bound) => Math.Min(Math.Max(bound < 0 ? bound + size : bound, 0), size);
+
+            var start = Normalize(slice.Start ?? 0);
+            var stop = Math.Max(Normalize(slice.Stop ?? size), start);
+
+            // Ceiling division: rows start, start + step, ... below stop. Truncating division
+            // drops the last row whenever (stop - start) is not a multiple of step.
+            var span = stop - start;
+            var rowCount = span / step + (span % step == 0 ? 0 : 1);
+
+            var data1 = new List<Series>(_columns.Count);
+            for (int col = 0; col < _columns.Count; col++)
+            {
+                var series = new Series(_columns[col]);
+                series.Allocate(rowCount);
+                data1.Add(series);
+            }
+
+            var index = new Series(new Column
+            {
+                Name = _index.name,
+                DType = _index.dtype
+            });
+            index.Allocate(rowCount);
+
+            // Drive the loop by the target position so a very large step cannot overflow the row counter.
+            for (int target = 0; target < rowCount; target++)
+            {
+                var row = start + target * step;
+                for (int col = 0; col < _columns.Count; col++)
+                {
+                    data1[col].SetValue(_data[col].GetValue(row), target);
+                }
+                index.SetValue(_index.GetValue(row), target);
+            }
+
+            foreach (var d in data1)
+                d.SetIndex(index);
+
+            // Give the new frame its own column list: sharing _columns would let a column
+            // added to or removed from one frame silently change the other.
+            return new DataFrame(data1, index, new List<Column>(_columns));
+        }
+    }
+}
diff --git a/src/Pandas.NET/DataFrames/DataFrame.ToString.cs b/src/Pandas.NET/DataFrames/DataFrame.ToString.cs
new file mode 100644
index 0000000..c28c856
--- /dev/null
+++ b/src/Pandas.NET/DataFrames/DataFrame.ToString.cs
@@ -0,0 +1,35 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+using System.Linq;
+
+namespace PandasNet
+{
+    public partial class DataFrame
+    {
+        public override string ToString()
+        {
+            var display = " ";
+            var header = string.Join(" ", _columns.Select(x => x.Name));
+            display += header;
+
+            for (int i = 0; i < _index.size; i++)
+            {
+                if (i > 4) break;
+                var values = new List();
+                values.Add(_index.GetValue(i).ToString());
+                for(int col = 0; col <
_columns.Count; col++)
+                {
+                    var value = _data[col].GetValue(i);
+                    if (value is double float64)
+                        values.Add(Convert.ToSingle(float64).ToString());
+                    else
+                        values.Add(value.ToString());
+                }
+                display += "\r\n" + string.Join(" ", values);
+            }
+
+            return display;
+        }
+    }
+}
diff --git a/src/Pandas.NET/DataFrames/DataFrame.copy.cs b/src/Pandas.NET/DataFrames/DataFrame.copy.cs
new file mode 100644
index 0000000..f8c3729
--- /dev/null
+++ b/src/Pandas.NET/DataFrames/DataFrame.copy.cs
@@ -0,0 +1,24 @@
+using System.Collections.Generic;
+using System.Linq;
+
+namespace PandasNet;
+
+public partial class DataFrame
+{
+    public DataFrame copy()
+    {
+        var data = new List();
+        foreach (var s in _data)
+        {
+            data.Add(s.copy());
+        }
+
+        var columns = new List();
+        foreach(var c in _columns)
+        {
+            columns.Add(new Column { Name = c.Name, DType = c.DType });
+        }
+
+        return new DataFrame(data, columns: columns, index: _index.copy());
+    }
+}
diff --git a/src/Pandas.NET/DataFrames/DataFrame.cs b/src/Pandas.NET/DataFrames/DataFrame.cs
new file mode 100644
index 0000000..b9a05ea
--- /dev/null
+++ b/src/Pandas.NET/DataFrames/DataFrame.cs
@@ -0,0 +1,77 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace PandasNet
+{
+    /// <summary>
+    /// A table made of <see cref="Series"/> columns that share one row index.
+    /// </summary>
+    public partial class DataFrame
+    {
+        private List<Series> _data;
+        /// <summary>Column data, one <see cref="Series"/> per column.</summary>
+        public List<Series> data => _data;
+
+        private Series _index;
+        /// <summary>Row index of the frame.</summary>
+        public Series index
+        {
+            get => _index;
+            set => _index = value;
+        }
+
+        private List<Column> _columns;
+        /// <summary>Column descriptors of the frame.</summary>
+        public List<Column> columns => _columns;
+
+        /// <summary>Number of dimensions (length of <see cref="shape"/>).</summary>
+        public int ndim => _shape.Length;
+
+        private int[] _shape;
+        /// <summary>Row count followed by column count.</summary>
+        public int[] shape => _shape;
+
+        /// <summary>
+        /// Builds a frame from ready-made column series and attaches the row index to each of them.
+        /// </summary>
+        /// <param name="data">Column series; must not be null, may be empty.</param>
+        /// <param name="index">Row index; defaults to 0..rowCount-1, sized from the first column.</param>
+        /// <param name="columns">Column descriptors; defaults to each series' own column.</param>
+        /// <param name="copy">Not used by this constructor: the series are stored by reference.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
+        public DataFrame(List<Series> data, Series index = null, List<Column> columns = null, bool copy = false)
+        {
+            if (data == null)
+                throw new ArgumentNullException(nameof(data));
+
+            if (index == null)
+            {
+                // An empty column list has no first series to measure, so fall back to zero rows.
+                var rowCount = data.Count > 0 ? data[0].size : 0;
+                index = new Series(Enumerable.Range(0, rowCount).ToArray());
+            }
+
+            if (columns == null)
+            {
+                columns = data.Select(x => x.column).ToList();
+            }
+
+            _data = data;
+            _index = index;
+            _columns = columns;
+
+            _shape = new int[]
+            {
+                index.size,
+                columns.Count
+            };
+
+            // Point every column at the frame's index.
+            foreach (var s in _data)
+            {
+                s.SetIndex(_index);
+            }
+        }
+    }
+}
diff --git a/src/Pandas.NET/DataFrames/DataFrame.describe.cs b/src/Pandas.NET/DataFrames/DataFrame.describe.cs
new file mode 100644
index 0000000..1d380be
--- /dev/null
+++ b/src/Pandas.NET/DataFrames/DataFrame.describe.cs
@@ -0,0 +1,42 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+using System.Linq;
+
+namespace PandasNet
+{
+    public partial class DataFrame
+    {
+        public DataFrame describe()
+        {
+            var stat_index = new Series(new string[]
+            {
+                "count", "mean", "std", "min", "25%", "50%", "75%", "max"
+            });
+
+            var data = _data.Where(s => new Type[] { typeof(int[]), typeof(float[]), typeof(double[]) }.Contains(s.data.GetType()))
+                .Select(x =>
+                {
+                    var series = new Series(new double[]
+                    {
+                        x.count(),
+                        x.mean(),
+                        x.std(),
+                        x.min(),
+                        x.q1(),
+                        x.q2(),
+                        x.q3(),
+                        x.max()
+                    }, new Column
+                    {
+                        DType = typeof(double),
+                        Name = x.name
+                    });
+                    series.SetIndex(stat_index);
+                    return series;
+                }).ToList();
+
+            return new DataFrame(data, index: stat_index);
+        }
+    }
+}
diff --git a/src/Pandas.NET/DataFrames/DataFrame.drop.cs b/src/Pandas.NET/DataFrames/DataFrame.drop.cs
new file mode 100644
index 0000000..34260e4
--- /dev/null
+++ b/src/Pandas.NET/DataFrames/DataFrame.drop.cs
@@ -0,0 +1,31 @@
+using System.Collections.Generic;
+using System.Linq;
+
+namespace PandasNet;
+
+public partial class DataFrame
+{
+    /// <summary>
+    /// Returns a new DataFrame without the rows listed in <paramref name="index"/>.
+    /// </summary>
+    /// <param name="index">Row index values to remove; null or empty removes nothing.</param>
+    /// <returns>A frame built from each column's <c>drop</c> result and the remaining index values.</returns>
+    public DataFrame drop(int[] index = null)
+    {
+        // The default argument is null; treat it as "drop nothing" instead of failing on index.Contains.
+        index ??= new int[0];
+
+        var data = new List<Series>();
+        foreach (var s in _data)
+        {
+            data.Add(s.drop(index));
+        }
+
+        // A hash set keeps the index filter linear in the number of rows.
+        var dropped = new HashSet<int>(index);
+        var index2 = _index.array<int>().Where(x => !dropped.Contains(x)).ToArray();
+
+        // Pass a private copy of the column list so later column edits on one frame do not leak into the other.
+        return new DataFrame(data, columns: new List<Column>(_columns), index: new Series(index2));
+    }
+}
diff --git a/src/Pandas.NET/DataFrames/DataFrame.dropna.cs b/src/Pandas.NET/DataFrames/DataFrame.dropna.cs
new file mode 100644
index 0000000..45e1b9d
--- /dev/null
+++ b/src/Pandas.NET/DataFrames/DataFrame.dropna.cs
@@ -0,0 +1,31 @@
+using System.Collections.Generic;
+using
System.Linq; + +namespace PandasNet; + +public partial class DataFrame +{ + public DataFrame dropna() + { + var excludeRowIndex = new List(); + for (int i = 0; i < _index.size; i++) + { + if (HasNullValue(_data, i)) + excludeRowIndex.Add(i); + } + + var excludeRowIndexArray = excludeRowIndex.ToArray(); + var data = new List(); + foreach (var s in _data) + { + var series = s.drop(excludeRowIndexArray); + data.Add(series); + } + return new DataFrame(data, columns: _columns); + } + + private bool HasNullValue(List data, int row) + { + return data.Any(x => x.IsNull(row)); + } +} diff --git a/src/Pandas.NET/DataFrames/DataFrame.head.cs b/src/Pandas.NET/DataFrames/DataFrame.head.cs new file mode 100644 index 0000000..87f60b8 --- /dev/null +++ b/src/Pandas.NET/DataFrames/DataFrame.head.cs @@ -0,0 +1,11 @@ +using Tensorflow; + +namespace PandasNet; + +public partial class DataFrame +{ + public DataFrame head(int n = 5) + { + return this[new Slice(0, n, 1)]; + } +} diff --git a/src/Pandas.NET/DataFrames/DataFrame.math.cs b/src/Pandas.NET/DataFrames/DataFrame.math.cs new file mode 100644 index 0000000..202ab2e --- /dev/null +++ b/src/Pandas.NET/DataFrames/DataFrame.math.cs @@ -0,0 +1,53 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Linq; + +namespace PandasNet +{ + public partial class DataFrame + { + /// + /// Calculates the mean (average) of each column in the DataFrame. + /// + /// + /// A new Series where the index is the column names and the values are the means. 
+ /// + public Series mean() + { + // Create a new Series for the index, using the names of the columns in the DataFrame + var index = new Series(_data.Select(x => x.column.Name).ToArray()); + + // Create a new Series for the data, using the mean of the values in each column + var series = new Series(_data.Select(x => x.mean()).ToArray()); + + // Set the index of the data Series to be the index Series we created earlier + series.SetIndex(index); + + // Return the data Series, which now has the column names as its index and the means as its values + return series; + } + + + /// + /// Calculates the standard deviation of each column in the DataFrame. + /// + /// + /// A new Series where the index is the column names and the values are the standard deviations. + /// + public Series std() + { + // Create a new Series for the index, using the names of the columns in the DataFrame + var index = new Series(_data.Select(x => x.column.Name).ToArray()); + + // Create a new Series for the data, using the standard deviation of the values in each column + var series = new Series(_data.Select(x => x.std()).ToArray()); + + // Set the index of the data Series to be the index Series we created earlier + series.SetIndex(index); + + // Return the data Series, which now has the column names as its index and the standard deviations as its values + return series; + } + } +} diff --git a/src/Pandas.NET/DataFrames/DataFrame.operator.cs b/src/Pandas.NET/DataFrames/DataFrame.operator.cs new file mode 100644 index 0000000..d6bf53b --- /dev/null +++ b/src/Pandas.NET/DataFrames/DataFrame.operator.cs @@ -0,0 +1,107 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Linq; +using Tensorflow; + +namespace PandasNet +{ + public partial class DataFrame + { + public static DataFrame operator -(DataFrame a, Series b) + { + var data = new List(); + + for (int i = 0; i < a.data.Count; i++) + { + data.Add(a.data[i] - b.data switch + { + double[] double64 => 
double64[i], + float[] float32 => float32[i], + int[] int32 => int32[i], + _ => throw new NotImplementedException("") + }); + } + + return new DataFrame(data, index: a.index, columns: a.columns); + } + + public static DataFrame operator /(DataFrame a, Series b) + { + var data = new List(); + + for (int i = 0; i < a.data.Count; i++) + { + data.Add(a.data[i] / b.data switch + { + double[] double64 => double64[i], + float[] float32 => float32[i], + int[] int32 => int32[i], + _ => throw new NotImplementedException("") + }); + } + + return new DataFrame(data, index: a.index, columns: a.columns); + } + + public static DataFrame operator *(DataFrame a, Series b) + { + var data = new List(); + for(int i=0;i a.data[i].array().SequenceEqual(b.data[i].array()), + int[] => a.data[i].array().SequenceEqual(b.data[i].array()), + float[] => a.data[i].array().SequenceEqual(b.data[i].array()), + double[] => a.data[i].array().SequenceEqual(b.data[i].array()), + string[] => a.data[i].array().SequenceEqual(b.data[i].array()), + DateTime[] => a.data[i].array().SequenceEqual(b.data[i].array()), + _ => throw new NotImplementedException("") + }; + if (!matched) { return false; } + } + + return true; + } + + public static bool operator !=(DataFrame a, DataFrame b) + { + return !(a == b); + } + + public override bool Equals(object obj) + { + return obj is DataFrame frame && + data == frame.data && + columns == frame.columns && + index == frame.index; + } + + public override int GetHashCode() + { + return HashCode.Combine(data, columns, index); + } + } +} diff --git a/src/Pandas.NET/DataFrames/DataFrame.pop.cs b/src/Pandas.NET/DataFrames/DataFrame.pop.cs new file mode 100644 index 0000000..80b226e --- /dev/null +++ b/src/Pandas.NET/DataFrames/DataFrame.pop.cs @@ -0,0 +1,27 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Linq; + +namespace PandasNet +{ + public partial class DataFrame + { + public Series pop(string columnName) + { + Series series = null; 
+
+            for (int i = 0; i < _columns.Count; i++)
+            {
+                if (_columns[i].Name != columnName) continue;
+                series = _data[i];
+                _columns.RemoveAt(i);
+                _data.RemoveAt(i);
+                _shape[1] = _columns.Count; // keep the cached column count in sync
+                break;                      // indices shift after RemoveAt, so stop at the first match
+            }
+
+            return series;
+        }
+    }
+}
diff --git a/src/Pandas.NET/DataFrames/DataFrame.sample.cs b/src/Pandas.NET/DataFrames/DataFrame.sample.cs
new file mode 100644
index 0000000..377e13e
--- /dev/null
+++ b/src/Pandas.NET/DataFrames/DataFrame.sample.cs
@@ -0,0 +1,90 @@
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using System.Reflection.PortableExecutable;
+using Tensorflow.Util;
+
+namespace PandasNet;
+
+public partial class DataFrame
+{
+    /// <summary>
+    /// Returns a random sample of rows.
+    /// </summary>
+    /// <param name="n">Number of rows to draw; leave at 0 when <paramref name="frac"/> is used.</param>
+    /// <param name="frac">Fraction of the rows to draw, rounded up; leave at 0 when <paramref name="n"/> is used.</param>
+    /// <param name="random_state">Seed passed to <see cref="Random"/>.</param>
+    /// <param name="replace">When true, the same row may be drawn more than once.</param>
+    /// <returns>A new DataFrame whose index holds the positions of the sampled rows.</returns>
+    /// <exception cref="ArgumentException">
+    /// Neither or both of <paramref name="n"/> and <paramref name="frac"/> are set, one of them is negative,
+    /// or more rows are requested than can be drawn.
+    /// </exception>
+    public DataFrame sample(int n = 0, float frac = 0, int random_state = 0, bool replace = false)
+    {
+        if (n == 0 && frac == 0)
+        {
+            throw new ArgumentException("Either n or frac should be greater than 0");
+        }
+        if (n != 0 && frac != 0)
+        {
+            throw new ArgumentException("Only one of n or frac should be greater than 0");
+        }
+        if (n < 0 || frac < 0)
+        {
+            throw new ArgumentException("n and frac should not be negative");
+        }
+
+        var rowCount = _index.size;
+        if (frac > 0)
+        {
+            n = (int)Math.Ceiling(frac * rowCount);
+        }
+
+        // Without replacement each row can be used at most once; with replacement any n works
+        // as long as there is at least one row to draw from.
+        if (n > rowCount && (!replace || rowCount == 0))
+        {
+            throw new ArgumentException("n should not be greater than the size of the DataFrame unless replace is true");
+        }
+
+        // treat axis as 0 for now. support for axis=1 should be added in the future
+        var rnd = new Random(random_state);
+
+        // positions of the sampled rows
+        List<int> sampleIndex;
+        if (!replace)
+        {
+            // shuffle all positions and keep the first n, so there are no duplicates
+            sampleIndex = Enumerable
+                .Range(0, rowCount)
+                .OrderBy(arg => rnd.Next())
+                .Take(n).ToList();
+        }
+        else
+        {
+            // draw n independent positions; the upper bound of Random.Next is exclusive,
+            // so passing rowCount lets every row, including the last one, be picked
+            sampleIndex = new List<int>(n);
+            for (int i = 0; i < n; i++)
+            {
+                sampleIndex.Add(rnd.Next(rowCount));
+            }
+        }
+
+        // copy the sampled values column by column, keeping the original column order
+        var data = new List<Series>(_data.Count);
+        foreach (var s in _data)
+        {
+            var values = new ArrayList(sampleIndex.Count);
+            foreach (var position in sampleIndex)
+            {
+                values.Add(s.data.GetValue(position));
+            }
+            data.Add(new Series(values.ToArray(s.column.DType), s.column));
+        }
+
+        return new DataFrame(data, index: new Series(sampleIndex.ToArray()));
+    }
+}
diff --git a/src/Pandas.NET/DataFrames/DataFrame.tail.cs b/src/Pandas.NET/DataFrames/DataFrame.tail.cs
new file mode 100644
index 0000000..fbb2817
--- /dev/null
+++ b/src/Pandas.NET/DataFrames/DataFrame.tail.cs
@@ -0,0 +1,19 @@
+using Tensorflow;
+
+namespace PandasNet;
+
+public partial class DataFrame
+{
+    /// <summary>
+    /// Returns the last <paramref name="n"/> rows.
+    /// </summary>
+    /// <param name="n">Number of rows to keep; values above the row count return every row, values of 0 or less return none.</param>
+    public DataFrame tail(int n = 5)
+    {
+        // _data.Count is the number of columns; slice bounds must be row positions.
+        var rowCount = _index.size;
+        // Clamp so an oversized or negative n cannot produce a start outside [0, rowCount].
+        var start = System.Math.Min(System.Math.Max(rowCount - n, 0), rowCount);
+        return this[new Slice(start: start, stop: rowCount)];
+    }
+}
diff --git a/src/Pandas.NET/DataFrames/DataFrame.to_numpy.cs b/src/Pandas.NET/DataFrames/DataFrame.to_numpy.cs
new file mode 100644
index 0000000..e46dfe2
--- /dev/null
+++ b/src/Pandas.NET/DataFrames/DataFrame.to_numpy.cs
@@ -0,0 +1,24 @@
+using System.Collections.Generic;
+using System.Linq;
+using Tensorflow;
+using
Tensorflow.NumPy; + +namespace PandasNet; + +public partial class DataFrame +{ + public NDArray to_numpy() + { + var array = new float[_index.size, _data.Count]; + // loop column + for (var col = 0; col < _data.Count; col++) + { + var data = _data[col].array(); + for (var row = 0; row < _index.size; row++) + { + array[row, col] = data[row]; + } + } + return array; + } +} diff --git a/src/Pandas.NET/DataFrames/DataFrame.transpose.cs b/src/Pandas.NET/DataFrames/DataFrame.transpose.cs new file mode 100644 index 0000000..84a193e --- /dev/null +++ b/src/Pandas.NET/DataFrames/DataFrame.transpose.cs @@ -0,0 +1,38 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Linq; + +namespace PandasNet +{ + public partial class DataFrame + { + public DataFrame transpose() + { + var index = new Series(_data.Select(x => x.name).ToArray()); + var data = new List(); + + for (var col = 0; col < _index.size; col++) + { + var series = new Series(new Column + { + DType = typeof(double), + Name = _index.GetValue(col).ToString() + }); + series.Allocate(_columns.Count); + series.SetIndex(index); + data.Add(series); + } + + for (var col = 0; col < _data.Count; col++) + { + for(var row = 0; row < _index.size; row++) + { + data[row].SetValue(_data[col].GetValue(row), col); + } + } + + return new DataFrame(data, index: index); + } + } +} diff --git a/src/Pandas.NET/ExtendedMethods/ObjectExtended.cs b/src/Pandas.NET/ExtendedMethods/ObjectExtended.cs new file mode 100644 index 0000000..7cc86ce --- /dev/null +++ b/src/Pandas.NET/ExtendedMethods/ObjectExtended.cs @@ -0,0 +1,14 @@ +using System; +using System.Collections.Generic; + +public static class ObjectExtended +{ + public static bool IsNumeric(this Object obj) + { + if(obj == null) + return false; + + Type type = obj.GetType(); + return type.IsNumericType(); + } +} \ No newline at end of file diff --git a/src/Pandas.NET/ExtendedMethods/TypeExtended.cs b/src/Pandas.NET/ExtendedMethods/TypeExtended.cs new file 
mode 100644 index 0000000..4f59076 --- /dev/null +++ b/src/Pandas.NET/ExtendedMethods/TypeExtended.cs @@ -0,0 +1,28 @@ +using System; +using System.Collections.Generic; + +public static class TypeExtended +{ + private static HashSet NumericTypes = new HashSet + { + typeof(Int16), + typeof(UInt16), + typeof(int), + typeof(uint), + typeof(long), + typeof(ulong), + typeof(double), + typeof(decimal), + typeof(float), + typeof(Single) + }; + + public static bool IsNumericType(this Type type) + { + if(type == null) + return false; + + return NumericTypes.Contains(type) || + NumericTypes.Contains(Nullable.GetUnderlyingType(type)); + } +} \ No newline at end of file diff --git a/src/Pandas.NET/Extensions/DataFrameMethods.cs b/src/Pandas.NET/Extensions/DataFrameMethods.cs deleted file mode 100644 index 0a0117a..0000000 --- a/src/Pandas.NET/Extensions/DataFrameMethods.cs +++ /dev/null @@ -1,49 +0,0 @@ -using PandasNet.Impl; -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - public static class DataFrameMethods - { - /// - /// - /// - /// - /// - /// 左边重叠字段的后缀 - /// 右边重叠字段的后缀 - /// 结果是否根据连接键进行排序 - /// - public static IDataFrame join(this IDataFrame df, IDataFrame other, string lsuffix = "", string rsuffix = "", bool sort = false) - { - throw new NotImplementedException(); - } - - #region groupby - - /// - /// 分组 - /// - /// - /// - /// - public static IGroupBy groupby(this IDataFrame df, string key) - { - return new DataFrameGroupBy(df, new Grouper(key)); - } - - /// - /// 分组 - /// - /// - /// - /// - public static IGroupBy groupby(this IDataFrame df, IGrouper grouper) - { - return new DataFrameGroupBy(df, grouper); - } - #endregion - } -} diff --git a/src/Pandas.NET/Extensions/PandasFactory.cs b/src/Pandas.NET/Extensions/PandasFactory.cs deleted file mode 100644 index 1b8f7bf..0000000 --- a/src/Pandas.NET/Extensions/PandasFactory.cs +++ /dev/null @@ -1,90 +0,0 @@ -using NumSharp.Core; -using PandasNet.Impl; -using System; -using 
System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace PandasNet -{ - public static class PandasFactory - { - #region Series - /// - /// - /// - /// - /// - /// - /// - public static SeriesBase Series(this Pandas pd, T[] data, IDataIndex index = null) - where T : struct - { - Series series = new Series(data) - { - Index = index - }; - return series; - } - - public static SeriesBase Series(this Pandas pd, T data, IDataIndex index = null) - { - Series res = null; - Type type = data.GetType(); - var properties = type.GetProperties(); - var nd = properties.Select(x => x.GetValue(data)).ToArray(); - res = new Series(nd) - { - Index = new DataIndex(properties.Select(x => x.Name).ToArray()) - }; - return res; - } - - - #endregion - - #region DataFrame - public static IDataFrame DataFrame(this Pandas pd, NDArray data, IList index, IList columns, Type dtype) - { - return new DataFrame(data, index, columns, dtype); - } - - public static IDataFrame DataFrame(this Pandas pd, NDArray data, IList index, IList columns, Type dtype) - { - return pd.DataFrame(data, index, columns, dtype); - } - - public static IDataFrame DataFrame(this Pandas pd, IDictionary data, IList index = null) - { - return pd.DataFrame(data, index); - } - - public static IDataFrame DataFrame(this Pandas pd, IDictionary data, IList index = null) - { - return new DataFrame(data, index); - } - - public static IDataFrame DataFrame(this Pandas pd, IList data, IList index = null) - { - var type = typeof(T); - if(type.FullName == "System.Object") - { - type = data[0].GetType(); - } - var props = type.GetProperties(); - var columnSize = props.Count(); - var rowSize = data.Count(); - var nd = new NDArray(typeof(object), new Shape(rowSize, columnSize)); - for (var i = 0; i < rowSize; i++) - { - for (var p = 0; p < columnSize; p++) - { - nd[i, p] = props[p].GetValue(data[i]); - } - } - return pd.DataFrame(nd, index, props.Select(x => x.Name).ToArray(), typeof(object)); - } - - #endregion - } -} 
diff --git a/src/Pandas.NET/Extensions/PandasMethods.Excel.cs b/src/Pandas.NET/Extensions/PandasMethods.Excel.cs deleted file mode 100644 index 1efb4b6..0000000 --- a/src/Pandas.NET/Extensions/PandasMethods.Excel.cs +++ /dev/null @@ -1,22 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - public static class PandasMethods - { - /// - /// - /// - /// - /// - /// - /// - /// - public static IDataFrame read_csv(this Pandas pd, string filepath, string sep = ",", string[] names = null) - { - throw new NotImplementedException(); - } - } -} diff --git a/src/Pandas.NET/IColumnIndexable.cs b/src/Pandas.NET/IColumnIndexable.cs deleted file mode 100644 index edd2f15..0000000 --- a/src/Pandas.NET/IColumnIndexable.cs +++ /dev/null @@ -1,19 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - public interface IColumnIndexable - { - IDataIndex Columns { get; } - - SeriesBase this[string columnLabel] { get; set; } - - IDataFrame this[params string[] columnLabels] { get; } - - SeriesBase this[int columnIndex] { get; } - - IDataFrame this[params int[] columnIndexs] { get; } - } -} diff --git a/src/Pandas.NET/IDataFrame.cs b/src/Pandas.NET/IDataFrame.cs deleted file mode 100644 index 286ac5c..0000000 --- a/src/Pandas.NET/IDataFrame.cs +++ /dev/null @@ -1,47 +0,0 @@ -using NumSharp.Core; -using PandasNet.Iteration; -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - public interface IDataFrame : IPandasObject, IRowIndexable, IColumnIndexable, ISliceable - { - ILoc loc { get; } - - IILoc iloc { get; } - - IDataFrame Head(int rowSize); - - /// - /// 设置列以及列的值;当列不存在时创建 - /// - /// - /// - /// - void SingleColumn(string columnLabel, T value); - - /// - /// 设置列以及列的值;当列不存在时报异常 - /// - /// - /// - /// - void SingleColumn(int columnIndex, T value); - - /// - /// 设置列以及列的值;当列不存在时创建 - /// - /// - /// - void Column(string columnLabel, NDArray 
value); - - /// - /// 设置列以及列的值;当列不存在时报异常 - /// - /// - /// - void Column(int columnIndex, NDArray value); - } -} diff --git a/src/Pandas.NET/IDataIndex.cs b/src/Pandas.NET/IDataIndex.cs deleted file mode 100644 index 5589e37..0000000 --- a/src/Pandas.NET/IDataIndex.cs +++ /dev/null @@ -1,25 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - public interface IDataIndex: IPandasObject - { - /// - /// 获取标签值的索引下标 - /// - /// - /// - /// - int GetPosition(T key); - - /// - /// 获取标签值的索引下标 - /// - /// - /// - /// - IEnumerable GetPosition(params T[] keys); - } -} diff --git a/src/Pandas.NET/IGroupBy.cs b/src/Pandas.NET/IGroupBy.cs deleted file mode 100644 index cca7eaf..0000000 --- a/src/Pandas.NET/IGroupBy.cs +++ /dev/null @@ -1,29 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - /// - /// 分组结果 - /// - public interface IGroupBy - { - /// - /// 分组后值与标签的结果 - /// - Dictionary Groups { get; } - - /// - /// 分组后值与索引的结果 - /// - Dictionary Indices { get; } - - /// - /// 从分组结果中获取指定分组key的结果集 - /// - /// - /// - IDataFrame GetGroup(object name); - } -} diff --git a/src/Pandas.NET/IGrouper.cs b/src/Pandas.NET/IGrouper.cs deleted file mode 100644 index c985562..0000000 --- a/src/Pandas.NET/IGrouper.cs +++ /dev/null @@ -1,31 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - public interface IGrouper - { - string Level { get; } - - /// - /// 分组关键字 - /// - string Key { get; } - - /// - /// 如果目标选择(通过键或级别)是类似日期时间的对象,则将按指定的频率进行分组 - /// - string Freq { get; } - - /// - /// 分组的轴,默认0(行) - /// - int Axis { get; } - - /// - /// 是否排序,默认false; - /// - bool Sort { get; } - } -} diff --git a/src/Pandas.NET/IPandasObject.cs b/src/Pandas.NET/IPandasObject.cs deleted file mode 100644 index 8c742ca..0000000 --- a/src/Pandas.NET/IPandasObject.cs +++ /dev/null @@ -1,15 +0,0 @@ -using System; -using NumSharp.Core; - -namespace PandasNet -{ 
- public interface IPandasObject - { - Type DType { get; } - object Name { get; set; } - int NDIM { get; } - Shape Shape { get; } - int Size { get; } - NDArray Values { get; set; } - } -} \ No newline at end of file diff --git a/src/Pandas.NET/IRowIndexable.cs b/src/Pandas.NET/IRowIndexable.cs deleted file mode 100644 index f961d04..0000000 --- a/src/Pandas.NET/IRowIndexable.cs +++ /dev/null @@ -1,16 +0,0 @@ -using PandasNet.Impl; -using PandasNet.Iteration; -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - /// - /// 允许行索引 - /// - public interface IRowIndexable - { - IDataIndex Index { get; } - } -} diff --git a/src/Pandas.NET/ISliceable.cs b/src/Pandas.NET/ISliceable.cs deleted file mode 100644 index be470d6..0000000 --- a/src/Pandas.NET/ISliceable.cs +++ /dev/null @@ -1,14 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - /// - /// 允许切片 - /// - public interface ISliceable - { - T this[Slice s] { get; } - } -} diff --git a/src/Pandas.NET/Impl/DataFrame.Construct.cs b/src/Pandas.NET/Impl/DataFrame.Construct.cs deleted file mode 100644 index 05ed313..0000000 --- a/src/Pandas.NET/Impl/DataFrame.Construct.cs +++ /dev/null @@ -1,144 +0,0 @@ -using NumSharp.Core; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace PandasNet.Impl -{ - public partial class DataFrame - { - private readonly IList _rawIndex; - private readonly IList _rawColumns; - private readonly int _rowSize; - private readonly Type _dtype; - - public DataFrame(NDArray data, IList index, IList columns, Type dtype) - { - this._rawIndex = index; - this._rowSize = data.shape[0]; - this._rawColumns = columns; - this._dtype = dtype; - - this.Values = data; - this.CreateRowIndex(); - this.CreateColumnIndex(); - } - - public DataFrame(IDictionary data, IList index = null) - { - _rawIndex = index; - _rowSize = data.First().Value.shape[0]; - if (index != null && 
_rowSize != index.Count) - { - throw new ArgumentException("index的长度非法"); - } - _rawColumns = data.Keys.ToList(); - var shape = new Shape(_rowSize, 1); - var nds = data.Values.Select(x => x.reshape(shape)).ToArray(); - foreach (var nd in nds) - { - if (Values is null) - { - Values = nd; - } - else - { - Values = Values.hstack(nd); - } - } - this.CreateRowIndex(); - this.CreateColumnIndex(); - } - - protected virtual void AddColumnLabel(string columnName) - { - var cols = Columns.Values.Storage.GetData().AsQueryable().Select(x => x).Concat(new string[] { columnName }) - .ToArray(); - Columns.Values.Storage.Allocate(Columns.Values.Storage.DType, new Shape(cols.Length)); - Columns.Values.Storage.SetData(cols); - Columns.Values.Storage.ChangeTensorLayout(1); - _rawColumns.Add(columnName); - } - - protected virtual void AddColumn(string columnName, NDArray value) - { - AddColumnLabel(columnName); - AddColumnValue(value as NDArray); - } - - protected virtual void AddColumn(string columnName, SeriesBase value) - { - AddColumnLabel(columnName); - AddColumnValue(value); - } - - protected virtual void AddColumnValue(T value) - { - var insertValues = new NDArray(typeof(T), new Shape(_rowSize, 1)); - for (var i = 0; i < _rowSize; i++) - { - insertValues[i, 0] = value; - } - AddColumnValue(insertValues); - } - - protected virtual void AddColumnValue(NDArray value) - { - if (value.size != _rowSize) - { - throw new ArgumentException("输入的数组长度不等于dataframe行数"); - } - var insertValues = value.reshape(new Shape(_rowSize, 1)); - Values = Values.hstack(insertValues); - } - - protected virtual void AddColumnValue(SeriesBase value) - { - var insertValues = value.Values.reshape(new Shape(_rowSize, 1)); - AddColumnValue(insertValues); - } - - protected virtual void CreateRowIndex() - { - DataIndex index = null; - if (_rawIndex == null) - { - index = new DataIndex(np.arange(_rowSize)); - } - else - { - if (_rawIndex.Count != _rowSize) - throw new ArgumentException("传入的行标签与传入的行数不一致!"); - Type 
indexType = typeof(TIndex); - switch (indexType.Name) - { - case ("Int32"): - index = new DataIndex(_rawIndex.Select(x => Convert.ToInt32(x)).ToArray()); - break; - case "String": - index = new DataIndex(_rawIndex.Select(x => x.ToString()).ToArray()); - break; - case "Object": - index = new DataIndex(_rawIndex.Select(x => x).ToArray()); - break; - } - } - Index = index; - } - protected virtual void CreateColumnIndex() - { - DataIndex index = null; - if (_rawColumns == null) - { - index = new DataIndex(np.arange(Values.shape.Length)); - } - else - { - index = new DataIndex(_rawColumns.Select(x => x.ToString()).ToArray()); - } - Columns = index; - } - - } -} diff --git a/src/Pandas.NET/Impl/DataFrame.cs b/src/Pandas.NET/Impl/DataFrame.cs deleted file mode 100644 index 6d60be8..0000000 --- a/src/Pandas.NET/Impl/DataFrame.cs +++ /dev/null @@ -1,221 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using NumSharp.Core; -using PandasNet.Iteration; -using PandasNet.Iteration.Impl; - -namespace PandasNet.Impl -{ - public partial class DataFrame : PandasObject, IDataFrame - { - public IDataIndex Index { get; internal set; } - - public IDataIndex Columns { get; internal set; } - - /// - /// 返回ISeries, 设置可为单个值也可是数组 - /// - /// - /// - public SeriesBase this[string columnLabel] - { - get - { - var columnIndex = Columns.GetPosition(columnLabel); - return this[columnIndex]; - } - set - { - var columnIndex = Columns.GetPosition(columnLabel); - if (columnIndex == -1) - { - AddColumn(columnLabel, value); - } - else - { - this[columnIndex] = value; - } - } - } - - public SeriesBase this[int columnIndex] - { - get - { - NDArray array = new NDArray(typeof(object), new Shape(_rowSize)); - for (var rowIndex = 0; rowIndex < _rowSize; rowIndex++) - { - array[rowIndex] = Values[rowIndex, columnIndex]; - } - var columnLabel = Columns.Values[columnIndex]; - return new Series(array) - { - Name = columnLabel, - Index = Index - }; - } - set - { - 
SetColumnValue(columnIndex, value); - } - } - - #region 列设置为单个值 - - public void SingleColumn(string columnLabel, T singleValue) - { - var columnIndex = Columns.GetPosition(columnLabel); - if (columnIndex == -1) - { - AddColumnLabel(columnLabel); - AddColumnValue(singleValue); - } - else - { - SetColumnValue(columnIndex, singleValue); - } - } - - public void SingleColumn(int columnIndex, T singleValue) - { - SetColumnValue(columnIndex, singleValue); - } - - protected void SetColumnValue(int columnIndex, T singleValue) - { - for (var i = 0; i < _rowSize; i++) - { - Values[i, columnIndex] = singleValue; - } - } - - #endregion - - #region 列设置为类数组值 - public void Column(string columnLabel, NDArray value) - { - var columnIndex = Columns.GetPosition(columnLabel); - if (columnIndex == -1) - { - AddColumnLabel(columnLabel); - AddColumnValue(value); - } - else - { - Column(columnIndex, value); - } - } - - public void Column(int columnIndex, NDArray value) - { - if (value.size != _rowSize) - { - throw new ArgumentException("输入数组的元素格式不等于dataframe的行数"); - } - for (var i = 0; i < _rowSize; i++) - { - Values[i, columnIndex] = value[i]; - } - } - - protected void SetColumnValue(int columnIndex, SeriesBase value) - { - Column(columnIndex, value.Values); - } - #endregion - - public IDataFrame this[params string[] columnLabels] - { - get - { - var columnIndexs = Columns.GetPosition(columnLabels).ToArray(); - return this[columnIndexs]; - } - } - - public IDataFrame this[params int[] columnIndexs] - { - get - { - var colLength = columnIndexs.Length; - NDArray array = new object[_rowSize, colLength]; - for (var rowIndex = 0; rowIndex < _rowSize; rowIndex++) - { - for (var col = 0; col < colLength; col++) - { - array[rowIndex, col] = Values[rowIndex, columnIndexs.ElementAt(col)]; - } - } - if (colLength == 1) - { - array = array.reshape(new Shape(_rowSize)); - } - var columnLabels = columnIndexs.Select(x => Columns.Values[x].ToString()).ToList(); - var result = new DataFrame(array, 
this._rawIndex, columnLabels, null); - return result; - } - } - - private DataFrameLoc _loc = null; - public ILoc loc - { - get - { - if (_loc == null) - { - _loc = new DataFrameLoc(this); - } - return _loc; - } - } - -#pragma warning disable IDE1006 // 命名样式 - public IILoc iloc -#pragma warning restore IDE1006 // 命名样式 - { - get - { - if (_loc == null) - { - _loc = new DataFrameLoc(this); - } - return _loc; - } - } - - /// - /// 切片 - /// - /// - /// - public IDataFrame this[Slice s] { - get - { - List objs = new List(); - NDArray nDArray = null; - //验证长度 - var lengthVail = Values.size / Values.ndim; - var length = s.End; - if (s.End > lengthVail) - length = lengthVail; - for (int i=s.Start;i< length; i+=s.Step) - { - var value = Values[i] as NDArray; - objs.AddRange(value.Storage.GetData()); - } - nDArray=np.array(objs.ToArray(),Values.Storage.DType); - nDArray.reshape(objs.Count/ Values.ndim, Values.ndim); - var result= new DataFrame(nDArray, null, _rawColumns,_dtype); - return result; - } - - } - - public IDataFrame Head(int rowSize) - { - throw new NotImplementedException(); - } - } -} diff --git a/src/Pandas.NET/Impl/DataFrameGroupBy.cs b/src/Pandas.NET/Impl/DataFrameGroupBy.cs deleted file mode 100644 index 5314272..0000000 --- a/src/Pandas.NET/Impl/DataFrameGroupBy.cs +++ /dev/null @@ -1,112 +0,0 @@ -using NumSharp.Core; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace PandasNet.Impl -{ - public class DataFrameGroupBy : IGroupBy - { - private IDataFrame _dataFrame; - private IGrouper _grouper; - private Dictionary _groups; - private Dictionary _indices; - private SeriesBase _keySeries; - private Dictionary> _groupIndices; - - public DataFrameGroupBy(IDataFrame dataFrame, IGrouper grouper) - { - _dataFrame = dataFrame; - _grouper = grouper; - } - - public Dictionary Groups - { - get - { - - ExcuteGrouper(); - if (_groups == null) - { - _groups = _groupIndices.ToDictionary(x => x.Key, y => - { - var labels = 
y.Value.Select(z => _keySeries.Index.Values[z].ToString()).ToArray(); - return (new DataIndex(labels)) as IDataIndex; - }); - } - return _groups; - } - } - - public Dictionary Indices - { - get - { - ExcuteGrouper(); - if (_indices == null) - { - _indices = _groupIndices.ToDictionary(x => x.Key, y => new DataIndex(y.Value.ToArray()) as IDataIndex); - } - return _indices; - } - } - - public IDataFrame GetGroup(object name) - { - var indices = Indices[name].Values; - var size = indices.size; - var colSize = _dataFrame.Columns.Size; - var array = new NDArray(typeof(object), new Shape(size, colSize)); - var labels = new object[size]; - for (var i = 0; i < size; i++) - { - var index = Convert.ToInt32(indices[i]); - var row = _dataFrame.iloc[index]; - for (var j = 0; j < colSize; j++) - { - array[i, j] = row[j]; - } - - labels[i] = _keySeries.Index.Values[index]; - } - - return new DataFrame(array, labels, _dataFrame.Columns.Values.Storage.GetData(), typeof(object)); - } - - protected virtual void ExcuteGrouper() - { - if (_groupIndices != null) - { - return; - } - _indices = new Dictionary(); - var rowSize = _dataFrame.Index.Size; - - //TODO:未完成列轴 - if (_grouper.Axis == 0) - { - _keySeries = _dataFrame[_grouper.Key]; - } - - _groupIndices = new Dictionary>(); - for (var i = 0; i < rowSize; i++) - { - var currentKey = _keySeries[i]; - if (_groupIndices.ContainsKey(currentKey)) - { - _groupIndices[currentKey].Add(i); - } - else - { - var list = new List - { - i - }; - _groupIndices.Add(currentKey, list); - } - } - - } - } -} diff --git a/src/Pandas.NET/Impl/DataIndex.cs b/src/Pandas.NET/Impl/DataIndex.cs deleted file mode 100644 index 27deb55..0000000 --- a/src/Pandas.NET/Impl/DataIndex.cs +++ /dev/null @@ -1,46 +0,0 @@ -using NumSharp.Core; -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet.Impl -{ - public class DataIndex : PandasObject, IDataIndex - { - public DataIndex(NDArray array) - { - Values = array; - } - - public 
DataIndex(params int[] items) - { - Values = new NDArray(typeof(int), items.Length); - } - - public DataIndex(params string[] items) - { - Values = new NDArray(typeof(string), items.Length); - Values.Storage.SetData(items); - } - - public int GetPosition(T key) - { - var pos = Array.IndexOf(Values.Data(), key); - return pos; - } - - public IEnumerable GetPosition(params T[] keys) - { - var values = Values.Data(); - if (values == null) - { - yield break; - } - foreach (var idx in keys) - { - var pos = Array.IndexOf(values, idx); - yield return pos; - } - } - } -} diff --git a/src/Pandas.NET/Impl/Grouper.cs b/src/Pandas.NET/Impl/Grouper.cs deleted file mode 100644 index 54ebb57..0000000 --- a/src/Pandas.NET/Impl/Grouper.cs +++ /dev/null @@ -1,43 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet.Impl -{ - public class Grouper : IGrouper - { - /// - /// - /// - /// 分组关键字 - /// - /// 如果目标选择(通过键或级别)是类似日期时间的对象,则将按指定的频率进行分组 - /// 0:行轴,1:列轴 - /// - /// - public Grouper(string key, string level = null, string freq = null, int axis = 0, bool sort = false) - { - Key = key; - Level = level; - Freq = freq; - Axis = axis; - Sort = sort; - } - - public string Level { get; } - - public string Key { get; } - - /// - /// 如果目标选择(通过键或级别)是类似日期时间的对象,则将按指定的频率进行分组 - /// - public string Freq { get; } - - /// - /// 分组的轴,默认0(行) - /// - public int Axis { get; } - - public bool Sort { get; } - } -} diff --git a/src/Pandas.NET/Impl/Series.cs b/src/Pandas.NET/Impl/Series.cs deleted file mode 100644 index 107f608..0000000 --- a/src/Pandas.NET/Impl/Series.cs +++ /dev/null @@ -1,51 +0,0 @@ -using NumSharp.Core; -using PandasNet.Iteration; -using PandasNet.Iteration.Impl; -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet.Impl -{ - public class Series : SeriesBase - { - public Series(NDArray nd) - { - Values = nd; - } - - public override object this[int index] - { - get - { - return Values[index]; - } - set - 
{ - Values[index] = value; - } - } - - public override object this[string idx] - { - get - { - int pos = -1; - pos = Array.IndexOf(Index.Values.Data(), idx); - return Values[pos]; - } - } - - public override SeriesBase AsType(bool copy = true) - { - throw new NotImplementedException(); - } - - //public static implicit operator Series(int x) - //{ - // var series = new Series(new int[] { x }); - // return series; - //} - - } -} diff --git a/src/Pandas.NET/Iteration/IILoc.cs b/src/Pandas.NET/Iteration/IILoc.cs deleted file mode 100644 index c35c201..0000000 --- a/src/Pandas.NET/Iteration/IILoc.cs +++ /dev/null @@ -1,14 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet.Iteration -{ - /// - /// 基于整数位置的索引 - /// - public interface IILoc - { - SeriesBase this[int row] { get; } - } -} diff --git a/src/Pandas.NET/Iteration/ILoc.cs b/src/Pandas.NET/Iteration/ILoc.cs deleted file mode 100644 index 58c5ae8..0000000 --- a/src/Pandas.NET/Iteration/ILoc.cs +++ /dev/null @@ -1,35 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet.Iteration -{ - /// - /// 基于标签位置的索引器 - /// - public interface ILoc - { - /// - /// 单一标签(行) 作为ISeries返回。 - /// - /// - /// - SeriesBase this[string rowLabel] - { - get; - } - - /// - /// 行和列的单个标签 - /// - /// - /// - /// - object this[string rowLabel, string columnLabel] - { - get; - } - - IDataFrame this[string[,] rowAndColumnLabels] { get; } - } -} diff --git a/src/Pandas.NET/Iteration/IPandasEnumerable.cs b/src/Pandas.NET/Iteration/IPandasEnumerable.cs deleted file mode 100644 index 95bb625..0000000 --- a/src/Pandas.NET/Iteration/IPandasEnumerable.cs +++ /dev/null @@ -1,17 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet.Iteration -{ - /// - /// 索引,迭代接口 - /// - public interface IIndexingAndIteration : ILoc - { - SeriesBase Get(int key); - SeriesBase Get(string key); - - object At(int row, int column); - } 
-} diff --git a/src/Pandas.NET/Iteration/Impl/DataFrameLoc.cs b/src/Pandas.NET/Iteration/Impl/DataFrameLoc.cs deleted file mode 100644 index a65a0ae..0000000 --- a/src/Pandas.NET/Iteration/Impl/DataFrameLoc.cs +++ /dev/null @@ -1,48 +0,0 @@ -using NumSharp.Core; -using PandasNet.Impl; -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet.Iteration.Impl -{ - public class DataFrameLoc : ILoc, IILoc - { - private IDataFrame _dataFrame; - - public DataFrameLoc(IDataFrame dataFrame) - { - _dataFrame = dataFrame; - } - public SeriesBase this[string rowLabel] - { - get - { - var rowIndex = _dataFrame.Index.GetPosition(rowLabel); - return this[rowIndex]; - } - } - - public IDataFrame this[string[,] rowAndColumnLabels] => throw new NotImplementedException(); - - public SeriesBase this[int row] - { - get - { - var colLength = _dataFrame.Columns.Size; - NDArray array = new object[colLength]; - for (var i = 0; i < colLength; i++) - { - array[i] = _dataFrame.Values[row, i]; - } - return new Series(array) - { - Name = _dataFrame.Index.Values.Storage.GetData()[row], - Index = _dataFrame.Columns - }; - } - } - - public object this[string rowLabel, string columnLabel] => throw new NotImplementedException(); - } -}
diff --git a/src/Pandas.NET/Methods/Pandas.arange.cs b/src/Pandas.NET/Methods/Pandas.arange.cs
new file mode 100644
index 0000000..da2a4c4
--- /dev/null
+++ b/src/Pandas.NET/Methods/Pandas.arange.cs
@@ -0,0 +1,33 @@
+using System;
+using System.Linq;
+
+namespace PandasNet
+{
+    public partial class Pandas
+    {
+        /// <summary>
+        /// Returns evenly spaced values from <paramref name="start"/> (inclusive) up to
+        /// <paramref name="stop"/> (exclusive), advancing by <paramref name="step"/>.
+        /// </summary>
+        /// <typeparam name="T">Element type of the result; only Int32 is implemented.</typeparam>
+        /// <exception cref="ArgumentException"><paramref name="step"/> is zero.</exception>
+        /// <exception cref="NotImplementedException">T is not Int32.</exception>
+        public T[] arange<T>(int start = 0, int stop = 0, int step = 1)
+        {
+            if (step == 0)
+            {
+                throw new ArgumentException("step must not be zero", nameof(step));
+            }
+
+            // number of steps that fit in [start, stop); zero when the range runs the wrong way
+            long span = (long)stop - start;
+            int count = (int)Math.Max(0L, (span + step - Math.Sign(step)) / step);
+
+            return typeof(T).Name switch
+            {
+                "Int32" => Enumerable.Range(0, count).Select(x => (T)Convert.ChangeType(start + x * step, TypeCode.Int32)).ToArray(),
+                _ => throw new NotImplementedException("")
+            };
+        }
+    }
+}
diff --git a/src/Pandas.NET/Methods/Pandas.convert.cs b/src/Pandas.NET/Methods/Pandas.convert.cs new file mode 100644 index 0000000..106bd81 --- /dev/null +++
b/src/Pandas.NET/Methods/Pandas.convert.cs @@ -0,0 +1,46 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Linq; +using System.Globalization; +using OneOf.Types; + +namespace PandasNet +{ + public partial class Pandas + { + public int int32(double value) => Convert.ToInt32(Math.Floor(value)); + + public T[,] array(DataFrame df) + { + var data = new T[df.index.size, df.columns.Count]; + var shape = df.shape; + + for (int col = 0; col < shape[1]; col++) + { + if (df.data[col].data is T[] buffer) + { + for (int row = 0; row < shape[0]; row++) + data[row, col] = buffer[row]; + } + } + return data; + } + + public Toutput[,] array(DataFrame df) + { + var data = new Toutput[df.index.size, df.columns.Count]; + var shape = df.shape; + + for (int col = 0; col < shape[1]; col++) + { + if (df.data[col].data is Tinput[] buffer) + { + for (int row = 0; row < shape[0]; row++) + data[row, col] = (Toutput)Convert.ChangeType(buffer[row], typeof(Toutput)); + } + } + return data; + } + } +} diff --git a/src/Pandas.NET/Methods/Pandas.get_dummies.cs b/src/Pandas.NET/Methods/Pandas.get_dummies.cs new file mode 100644 index 0000000..781ec82 --- /dev/null +++ b/src/Pandas.NET/Methods/Pandas.get_dummies.cs @@ -0,0 +1,40 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace PandasNet; + +public partial class Pandas +{ + public DataFrame get_dummies(DataFrame data, + string[] columns = null, + string prefix = "", + string prefix_sep = "") + { + foreach (var colName in columns) + { + // remove column + var column = data.columns.First(x => x.Name == colName); + data.columns.Remove(column); + var series = data.data.First(x => x.column.Name == colName); + data.data.Remove(series); + + // expand columns + var newCols = new List(); + var newData = new List(); + var values = series.array(); + foreach (var col in values.Distinct()) + { + var array = values.Select(x => x == col ? 
1 : 0).ToArray(); + var newColumn = new Column { Name = col, DType = typeof(int) }; + var newSeries = new Series(array, data.index, newColumn); + newCols.Insert(0, newColumn); + newData.Insert(0, newSeries); + } + + data.columns.AddRange(newCols); + data.data.AddRange(newData); + } + return data; + } +} diff --git a/src/Pandas.NET/Methods/Pandas.math.cs b/src/Pandas.NET/Methods/Pandas.math.cs new file mode 100644 index 0000000..f1a506e --- /dev/null +++ b/src/Pandas.NET/Methods/Pandas.math.cs @@ -0,0 +1,16 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Linq; +using System.Globalization; + +namespace PandasNet +{ + public partial class Pandas + { + public float pi => (float)Math.PI; + + public Series sin(Series series) => series.sin(); + public Series cos(Series series) => series.cos(); + } +}
diff --git a/src/Pandas.NET/Methods/Pandas.read_csv.cs b/src/Pandas.NET/Methods/Pandas.read_csv.cs
new file mode 100644
index 0000000..7145687
--- /dev/null
+++ b/src/Pandas.NET/Methods/Pandas.read_csv.cs
@@ -0,0 +1,148 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using System.Linq;
+using System.Text.RegularExpressions;
+using PandasNet.Utils;
+
+namespace PandasNet;
+
+public partial class Pandas
+{
+    /// <summary>
+    /// Reads a delimited text file (local path or http/https URL) into a DataFrame.
+    /// </summary>
+    /// <param name="path">File path, or a web URL that is first downloaded into the temp directory.</param>
+    /// <param name="names">Column names to use; when given, header is forced to 0 and every line is data.</param>
+    /// <param name="sep">Field delimiter.</param>
+    /// <param name="na_values">A field consisting of exactly this character is stored as null.</param>
+    /// <param name="header">Number of leading lines that are not data; the first line supplies column names when names is null.</param>
+    /// <param name="comment">Character that starts a comment; the rest of the line is ignored.</param>
+    /// <param name="skipinitialspace">When true, each run of spaces in a line is collapsed to a single space before splitting.</param>
+    /// <returns>A DataFrame with one Series per column and a 0-based integer index over the data rows.</returns>
+    /// <exception cref="InvalidDataException">The file is empty and no column names were supplied.</exception>
+    public DataFrame read_csv(string path,
+        string[] names = null,
+        char sep = ',',
+        char? na_values = null,
+        int header = 1,
+        char? comment = null,
+        bool skipinitialspace = false)
+    {
+        // Download for web
+        if (IsWebUrl(path))
+        {
+            var fileName = path.Split('/').Last();
+            string data_dir = Path.GetTempPath();
+            Web.Download(path, data_dir, fileName);
+            path = Path.Combine(data_dir, fileName);
+        }
+
+        if (names != null)
+        {
+            header = 0;
+        }
+
+        var rows = File.ReadAllLines(path);
+        if (names == null && rows.Length == 0)
+        {
+            throw new InvalidDataException($"'{path}' is empty, so column names cannot be read from it");
+        }
+
+        var columns = names?.Select(x => new Column
+        {
+            Name = x
+        }).ToList() ?? rows[0].Split(sep).Select(x => new Column
+        {
+            Name = x.Trim('\"')
+        }).ToList();
+
+        // Header lines are not data: size the index and every series by the data rows only,
+        // otherwise each column starts with `header` entries that are never written.
+        int rowCount = Math.Max(rows.Length - header, 0);
+        var index = new Series(Enumerable.Range(0, rowCount).ToArray());
+
+        // add columns
+        var data = new List<Series>();
+        for (int col = 0; col < columns.Count; col++)
+        {
+            // the first data row decides the dtype; with no data rows fall back to string
+            columns[col].DType = rowCount > 0
+                ? InferDataType(rows[header], col, sep)
+                : typeof(string);
+            var series = new Series(columns[col]);
+            series.Allocate(rowCount);
+            series.SetIndex(index);
+            data.Add(series);
+        }
+
+        // set values
+        for (int row = 0; row < rowCount; row++)
+        {
+            var str = rows[row + header];
+            if (comment.HasValue)
+            {
+                str = str.Split(comment.Value).First();
+            }
+
+            if (skipinitialspace)
+            {
+                str = Regex.Replace(str, "( )+", " ");
+            }
+
+            var values = str.Split(sep);
+            for (int col = 0; col < columns.Count; col++)
+            {
+                // a short line (blank, or comment-only) has no field for this column
+                if (col >= values.Length)
+                {
+                    data[col].SetNull(row);
+                    continue;
+                }
+
+                var val = values[col];
+                if (na_values.HasValue && val.Equals(na_values.ToString()))
+                {
+                    // only this field is missing; keep reading the rest of the row
+                    data[col].SetNull(row);
+                    continue;
+                }
+                data[col].SetValue(val, row);
+            }
+        }
+
+        return new DataFrame(data, index: index, columns: columns);
+    }
+
+    /// <summary>
+    /// Guesses a column's type (int, float, double or string) from its field in one sample line.
+    /// </summary>
+    Type InferDataType(string row, int col, char sep)
+    {
+        // empty fields are dropped before indexing: this absorbs repeated separators, but shifts columns when a field is genuinely empty
+        var fields = row.Split(sep)
+            .Where(x => !string.IsNullOrEmpty(x))
+            .ToArray();
+        if (col >= fields.Length)
+            return typeof(string);
+
+        var val = fields[col];
+        if (int.TryParse(val, out var _))
+            return typeof(int);
+        else if (float.TryParse(val, out var _))
+            return typeof(float);
+        else if (double.TryParse(val, out var _))
+            return typeof(double);
+        return typeof(string);
+    }
+
+    /// <summary>
+    /// Returns true when <paramref name="url"/> starts with http:// or https:// (case-insensitive).
+    /// </summary>
+    bool IsWebUrl(string url)
+    {
+        return url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
+            || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
+    }
+}
diff --git a/src/Pandas.NET/Methods/Pandas.slice.cs b/src/Pandas.NET/Methods/Pandas.slice.cs new file mode 100644 index 0000000..d5a9b55 --- /dev/null +++ b/src/Pandas.NET/Methods/Pandas.slice.cs @@ -0,0 +1,14 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Linq; +using System.Globalization; +using Tensorflow; + +namespace PandasNet +{ + public partial class Pandas + { + public Slice slice(int start, int?
stop = null, int step = 1) => new Slice(start, stop, step); + } +} diff --git a/src/Pandas.NET/Methods/Pandas.to_datetime.cs b/src/Pandas.NET/Methods/Pandas.to_datetime.cs new file mode 100644 index 0000000..619e74a --- /dev/null +++ b/src/Pandas.NET/Methods/Pandas.to_datetime.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Linq; +using System.Globalization; + +namespace PandasNet +{ + public partial class Pandas + { + public Series to_datetime(Series series, string format = null) + { + var data = new DateTime[series.size]; + for (int i = 0; i < data.Length; i++) + { + data[i] = DateTime.ParseExact(series.data.GetValue(i).ToString(), format, CultureInfo.InvariantCulture); + } + + return new Series(data); + } + + readonly DateTime Epoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc); + public float timestamp(DateTime value) + { + TimeSpan elapsedTime = value - Epoch; + return (float)elapsedTime.TotalSeconds; + } + } +} diff --git a/src/Pandas.NET/Pandas - Backup.Net.csproj.backup b/src/Pandas.NET/Pandas - Backup.Net.csproj.backup new file mode 100644 index 0000000..3703e90 --- /dev/null +++ b/src/Pandas.NET/Pandas - Backup.Net.csproj.backup @@ -0,0 +1,32 @@ + + + + netstandard2.0;net6.0 + PandasNet + 0.3.0 + 9.0 + true + Haiping Chen + SciSharp STACK + Apache 2.0 + LICENSE + https://github.com/SciSharp + https://avatars3.githubusercontent.com/u/44989469?s=200&v=4 + https://github.com/SciSharp/Pandas.NET + git + Pandas + a0de1a80-b5db-4270-a191-01beeef4427b + 0.3.0.0 + 0.3.0.0 + + + + DEBUG;TRACE + + + + + + + + diff --git a/src/Pandas.NET/Pandas.NET.csproj b/src/Pandas.NET/Pandas.NET.csproj new file mode 100644 index 0000000..e496771 --- /dev/null +++ b/src/Pandas.NET/Pandas.NET.csproj @@ -0,0 +1,27 @@ + + + net8.0 + PandasNet + 0.6.0 + 10.0 + true + Haiping Chen + SciSharp STACK + Apache 2.0 + LICENSE + https://github.com/SciSharp + https://avatars3.githubusercontent.com/u/44989469?s=200&v=4 + 
https://github.com/SciSharp/Pandas.NET + git + Pandas + + + DEBUG;TRACE + + + + + + + + \ No newline at end of file diff --git a/src/Pandas.NET/Pandas.Net.csproj b/src/Pandas.NET/Pandas.Net.csproj deleted file mode 100644 index 17a8d85..0000000 --- a/src/Pandas.NET/Pandas.Net.csproj +++ /dev/null @@ -1,23 +0,0 @@ - - - - netstandard2.0 - PandasNet - 0.1.0 - true - Zhikui Hua, Haiping Chen - SciSharp STACK - Apache 2.0 - LICENSE - https://github.com/SciSharp - https://avatars3.githubusercontent.com/u/44989469?s=200&v=4 - https://github.com/SciSharp/Pandas.NET - git - Pandas - - - - - - - diff --git a/src/Pandas.NET/Pandas.cs b/src/Pandas.NET/Pandas.cs index ac6b69d..8c88b6b 100644 --- a/src/Pandas.NET/Pandas.cs +++ b/src/Pandas.NET/Pandas.cs @@ -4,7 +4,8 @@ namespace PandasNet { - public class Pandas + public partial class Pandas { + public DataFrameApi DataFrame = new DataFrameApi(); } } diff --git a/src/Pandas.NET/IGroupbyable.cs b/src/Pandas.NET/PandasApi.cs similarity index 51% rename from src/Pandas.NET/IGroupbyable.cs rename to src/Pandas.NET/PandasApi.cs index 0c92d23..fcb1d2f 100644 --- a/src/Pandas.NET/IGroupbyable.cs +++ b/src/Pandas.NET/PandasApi.cs @@ -4,11 +4,8 @@ namespace PandasNet { - /// - /// 允许分组 - /// - public interface IGroupbyable + public static class PandasApi { - + public static Pandas pd { get; } = new Pandas(); } } diff --git a/src/Pandas.NET/PandasObject.cs b/src/Pandas.NET/PandasObject.cs deleted file mode 100644 index 9efcf50..0000000 --- a/src/Pandas.NET/PandasObject.cs +++ /dev/null @@ -1,34 +0,0 @@ -using NumSharp.Core; -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - public abstract class PandasObject : IPandasObject - { - public object Name { get; set; } - - public NDArray Values { get; set; } - - /// - /// NDArray的数据类型 - /// - public Type DType => Values.dtype; - - /// - /// 维度 - /// - public int NDIM => Values.ndim; - - /// - /// - /// - public Shape Shape => Values.Storage.Shape; 
- - /// - /// 元素总数 - /// - public int Size => Values.size; - } -} diff --git a/src/Pandas.NET/Series/MaskedSeries.cs b/src/Pandas.NET/Series/MaskedSeries.cs new file mode 100644 index 0000000..e68bf00 --- /dev/null +++ b/src/Pandas.NET/Series/MaskedSeries.cs @@ -0,0 +1,21 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace PandasNet +{ + public class MaskedSeries : Series + { + private Series _mask; + public MaskedSeries(Array data, Series index, Column column) : + base(data, index, column) + { + + } + + public void SetMask(Series mask) + { + _mask = mask; + } + } +} diff --git a/src/Pandas.NET/Series/Series.Implicit.cs b/src/Pandas.NET/Series/Series.Implicit.cs new file mode 100644 index 0000000..3221af1 --- /dev/null +++ b/src/Pandas.NET/Series/Series.Implicit.cs @@ -0,0 +1,9 @@ +using Tensorflow.NumPy; + +namespace PandasNet; + +public partial class Series +{ + public static implicit operator NDArray(Series series) + => series.to_numpy(); +} diff --git a/src/Pandas.NET/Series/Series.Index.cs b/src/Pandas.NET/Series/Series.Index.cs new file mode 100644 index 0000000..4225ef3 --- /dev/null +++ b/src/Pandas.NET/Series/Series.Index.cs @@ -0,0 +1,51 @@ +using System; +using System.Linq; + +namespace PandasNet +{ + public partial class Series + { + public object this[string index] + { + get + { + var idx = Array.IndexOf(_index.data, index); + return _data.GetValue(idx); + } + + set + { + var idx = Array.IndexOf(_index.data, index); + _data.SetValue(value, idx); + } + } + + public object this[Series mask] + { + get + { + if(mask.data is bool[]) + { + var ms = new MaskedSeries(_data, _index, _column); + ms.SetMask(mask); + return ms; + } + throw new NotImplementedException(""); + } + + set + { + if (mask.data is bool[] masks) + { + for (int row = 0; row < masks.Length; row++) + { + if (masks[row]) + _data.SetValue(value, row); + } + return; + } + throw new NotImplementedException(""); + } + } + } +} diff --git 
a/src/Pandas.NET/Series/Series.Quantile.cs b/src/Pandas.NET/Series/Series.Quantile.cs new file mode 100644 index 0000000..9ae9cc6 --- /dev/null +++ b/src/Pandas.NET/Series/Series.Quantile.cs @@ -0,0 +1,25 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.CompilerServices; +using MathNet.Numerics.Statistics; + +namespace PandasNet; +public partial class Series +{ + public double Quantile(double q) + { + IList data = new List(); + foreach (var item in _data) + { + data.Add((double)Convert.ChangeType(item, typeof(double))); + } + + // in testing I have found that the R7 interpolation method is the most accurate when comparing the results to pandas in python. + // https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm + // R7 is the default method used in R. + return Statistics.QuantileCustom(data, q, QuantileDefinition.R7); + } + +} \ No newline at end of file diff --git a/src/Pandas.NET/Series/Series.array.cs b/src/Pandas.NET/Series/Series.array.cs new file mode 100644 index 0000000..3c0c5aa --- /dev/null +++ b/src/Pandas.NET/Series/Series.array.cs @@ -0,0 +1,23 @@ +namespace PandasNet; + +using System; +using System.Linq; + +public partial class Series +{ + public T[] array() + { + if (typeof(T) == typeof(float)) + { + if (dtype == typeof(string)) + { + return (_data as string[]).Select(x => float.Parse(x)).ToArray() as T[]; + } + else if (dtype == typeof(int)) + { + return (_data as int[]).Select(x => Convert.ToSingle(x)).ToArray() as T[]; + } + } + return _data as T[]; + } +} diff --git a/src/Pandas.NET/Series/Series.copy.cs b/src/Pandas.NET/Series/Series.copy.cs new file mode 100644 index 0000000..7eeeb27 --- /dev/null +++ b/src/Pandas.NET/Series/Series.copy.cs @@ -0,0 +1,22 @@ +using System; + +namespace PandasNet; + +public partial class Series +{ + public Series copy() + { + var data = _data switch + { + bool[] => Copy(array()), + int[] => Copy(array()), + float[] => 
Copy(array()), + double[] => Copy(array()), + DateTime[] => Copy(array()), + string[] => Copy(array()), + _ => throw new NotImplementedException("") + }; + + return new Series(data, column: _column, index: _index?.copy()); + } +} diff --git a/src/Pandas.NET/Series/Series.cs b/src/Pandas.NET/Series/Series.cs new file mode 100644 index 0000000..447142f --- /dev/null +++ b/src/Pandas.NET/Series/Series.cs @@ -0,0 +1,128 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace PandasNet +{ + public partial class Series + { + Array _data; + public Array data => _data; + Series _index; + public Series index => _index; + Column _column; + public Column column => _column; + + public string name => _column.Name; + public int size => _data.Length; + public Type dtype => _column.DType; + private List naIndex = new List(); + + public Series(Array data) + { + _data = data; + _column = data switch + { + bool[] bool1 => new Column { Name = string.Empty, DType = typeof(bool) }, + int[] int32 => new Column { Name = string.Empty, DType = typeof(int) }, + float[] float32 => new Column { Name = string.Empty, DType = typeof(float) }, + double[] float64 => new Column { Name = string.Empty, DType = typeof(double) }, + DateTime[] strings => new Column { Name = string.Empty, DType = typeof(DateTime) }, + string[] strings => new Column { Name = string.Empty, DType = typeof(string) }, + _ => throw new NotImplementedException("") + }; + } + + public Series(Column column) + { + _column = column; + } + + public Series(Array data, Column column) + { + _data = data; + _column = column; + } + + public Series(Array data, Series index, Column column) + { + _data = data; + _index = index; + _column = column; + } + + public void Allocate(int count) + { + if (_column.DType == typeof(int)) + _data = new int[count]; + else if (_column.DType == typeof(float)) + _data = new float[count]; + else if (_column.DType == typeof(double)) + _data = new double[count]; + else if 
(_column.DType == typeof(DateTime)) + _data = new DateTime[count]; + else if (_column.DType == typeof(string)) + _data = new string[count]; + else + throw new NotImplementedException(""); + } + + public void SetIndex(Series index) + { + _index = index; + } + + public object GetValue(int row) + { + return _data.GetValue(row); + } + + public T GetValue(int row) + { + return (T)_data.GetValue(row); + } + + public void SetNull(int row) + { + naIndex.Add(row); + } + + public bool IsNull(int row) + { + return naIndex.Contains(row); + } + + public void SetValue(T value, int row) + { + if (dtype == typeof(int) && value is string int32_string) + { + int.TryParse(int32_string, out var int32); + _data.SetValue(int32, row); + } + else if (dtype == typeof(float) && value is string float32_string) + { + float.TryParse(float32_string, out var float32); + _data.SetValue(float32, row); + } + else if (dtype == typeof(double) && value is string float64_string) + { + double.TryParse(float64_string, out var float64); + _data.SetValue(float64, row); + } + else if (dtype == typeof(DateTime) && value is string dt_string) + { + DateTime.TryParse(dt_string, out var datetime); + _data.SetValue(datetime, row); + } + else + { + _data.SetValue(value, row); + } + } + + public override string ToString() + { + return $"{name}, {size}, {dtype}"; + } + } +} diff --git a/src/Pandas.NET/Series/Series.drop.cs b/src/Pandas.NET/Series/Series.drop.cs new file mode 100644 index 0000000..9fe8c2f --- /dev/null +++ b/src/Pandas.NET/Series/Series.drop.cs @@ -0,0 +1,49 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace PandasNet; + +public partial class Series +{ + public Series drop(int[] index) + { + var data = _data switch + { + bool[] => Copy(array(), index), + int[] => Copy(array(), index), + float[] => Copy(array(), index), + double[] => Copy(array(), index), + DateTime[] => Copy(array(), index), + string[] => Copy(array(), index), + _ => throw new 
NotImplementedException("") + }; + + if (_index == null) + { + throw new NotSupportedException("_index was null for this series."); + } + if (index == null) + { + throw new NotSupportedException("array of indexes to drop cannot be null"); + } + if (_index.array() == null) + { + throw new NotSupportedException("_index.array() was null for this series."); + } + var index2 = _index.array().Where(x => !index.Contains(x)).ToArray(); + return new Series(data, column: _column, index: new Series(index2)); + } + + private Array Copy(T[] array, int[] excluded = null) + { + var data = new List(); + for (int i = 0; i < array.Length; i++) + { + if (excluded != null && excluded.Contains(i)) + continue; + data.Add(array[i]); + } + return data.ToArray(); + } +} diff --git a/src/Pandas.NET/Series/Series.map.cs b/src/Pandas.NET/Series/Series.map.cs new file mode 100644 index 0000000..23632e2 --- /dev/null +++ b/src/Pandas.NET/Series/Series.map.cs @@ -0,0 +1,16 @@ +using System; + +namespace PandasNet; + +public partial class Series +{ + public Series map(Func func) + { + var data = new Tout[size]; + for (int i = 0; i < size; i++) + { + data[i] = func((Tin)_data.GetValue(i)); + } + return new Series(data, column); + } +} diff --git a/src/Pandas.NET/Series/Series.math.cs b/src/Pandas.NET/Series/Series.math.cs new file mode 100644 index 0000000..7496022 --- /dev/null +++ b/src/Pandas.NET/Series/Series.math.cs @@ -0,0 +1,96 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace PandasNet +{ + public partial class Series + { + public int count() + => _data.Length; + + public double mean() + => sum() / count(); + + public double sum() => _data switch + { + int[] data => data.Sum(x => (double)x), + float[] data => data.Sum(), + double[] data => data.Sum(), + _ => throw new NotImplementedException("") + }; + + public double min() => _data switch + { + int[] data => data.Min(), + float[] data => data.Min(), + double[] data => data.Min(), + _ => throw new 
NotImplementedException($"typeof {_data.GetType()} is not supported") + }; + + public double max() => _data switch + { + int[] data => data.Max(), + float[] data => data.Max(), + double[] data => data.Max(), + _ => throw new NotImplementedException("") + }; + + public double std() + { + return _data switch + { + int[] data => SampleStandardDeviation(data), + float[] data => SampleStandardDeviation(data), + double[] data => SampleStandardDeviation(data), + _ => throw new NotImplementedException("") + }; + } + + private double PopulationStandardDeviation(double[] data) + { + var avg = data.Average(); + var variance = data.Average(v => Math.Pow(v - avg, 2)); + return Math.Sqrt(variance); + } + private double PopulationStandardDeviation(float[] data) => PopulationStandardDeviation(data.Select(x => (double)x).ToArray()); + private double PopulationStandardDeviation(int[] data) => PopulationStandardDeviation(data.Select(x => (double)x).ToArray()); + + + private double SampleStandardDeviation(double[] data) + { + var avg = data.Average(); + var variance = data.Average(v => Math.Pow(v - avg, 2)); + return Math.Sqrt(variance * data.Length / (data.Length - 1)); + } + private double SampleStandardDeviation(float[] data) => SampleStandardDeviation(data.Select(x => (double)x).ToArray()); + private double SampleStandardDeviation(int[] data) => SampleStandardDeviation(data.Select(x => (double)x).ToArray()); + + + public Series cos() + { + var cos = _data switch + { + int[] data => data.Select(x => (float)Math.Cos(x)), + float[] data => data.Select(x => (float)Math.Cos(x)), + double[] data => data.Select(x => (float)Math.Cos(x)), + _ => throw new NotImplementedException("") + }; + + return new Series(cos.ToArray()); + } + + public Series sin() + { + var sin = _data switch + { + int[] data => data.Select(x => (float)Math.Sin(x)), + float[] data => data.Select(x => (float)Math.Sin(x)), + double[] data => data.Select(x => (float)Math.Sin(x)), + _ => throw new 
NotImplementedException("") + }; + + return new Series(sin.ToArray()); + } + } +} diff --git a/src/Pandas.NET/Series/Series.operator.cs b/src/Pandas.NET/Series/Series.operator.cs new file mode 100644 index 0000000..9e56c4b --- /dev/null +++ b/src/Pandas.NET/Series/Series.operator.cs @@ -0,0 +1,125 @@ +using System; +using System.Linq; + +namespace PandasNet +{ + public partial class Series + { + public static Series operator !=(Series a, double b) + { + if (a.data is int[] || a.data is float[] || a.data is double[]) + { + return new Series(a.data.Cast().Select(x => x != b).ToArray() + , column: a._column + , index: a._index?.copy()); + } + + throw new NotImplementedException(""); + } + + public static Series operator ==(Series a, double b) + { + if (a.data is int[] || a.data is float[] || a.data is double[]) + { + return new Series(a.data.Cast().Select(x => x == b).ToArray() + , column: a._column + , index: a._index?.copy()); + } + + throw new NotImplementedException(""); + } + + public static Series operator *(Series a, Series b) + { + if (a.data is float[] float32a && b.data is float[] float32b) + { + var data = new float[a.index.size]; + for (var i = 0; i < data.Length; i++) + data[i] = float32a[i] * float32b[i]; + return new Series(data); + } + else if (a.data is double[] float64a && b.data is double[] float64b) + { + var data = new double[a.index.size]; + for (var i = 0; i < data.Length; i++) + data[i] = float64a[i] * float64b[i]; + return new Series(data); + } + throw new NotImplementedException(""); + } + + public static Series operator +(Series a, double b) + { + if (a.data is int[] || a.data is float[] || a.data is double[]) + { + return new Series(a.data.Cast().Select(x => x + b).ToArray() + , column: a._column + , index: a._index?.copy()); + } + + throw new NotImplementedException(""); + } + + public static Series operator -(Series a, double b) + { + if (a.data is int[] || a.data is float[] || a.data is double[]) + { + return new 
Series(a.data.Cast().Select(x => x - b).ToArray() + , column: a._column + , index: a._index?.copy()); + } + + throw new NotImplementedException(""); + } + + public static Series operator *(Series a, double b) + { + if (a.data is int[] || a.data is float[] || a.data is double[]) + { + return new Series(a.data.Cast().Select(x => x * b).ToArray() + , column: a._column + , index: a._index?.copy()); + } + + throw new NotImplementedException(""); + } + + public static Series operator /(Series a, double b) + { + if (a.data is int[] || a.data is float[] || a.data is double[]) + { + return new Series(a.data.Cast().Select(x => x / b).ToArray() + , column: a._column + , index: a._index?.copy()); + } + + throw new NotImplementedException(""); + } + + public override bool Equals(object obj) + { + if (obj is Series series) + { + if (series.data is double[] double64) + { + return data.Cast().SequenceEqual(double64); + } + else if (series.data is float[] float32) + { + return data.Cast().SequenceEqual(float32); + } + else if (series.data is int[] int32) + { + return data.Cast().SequenceEqual(int32); + } + } + + return false; + } + + public override int GetHashCode() + { + return base.GetHashCode(); + } + } +} diff --git a/src/Pandas.NET/Series/Series.q1.cs b/src/Pandas.NET/Series/Series.q1.cs new file mode 100644 index 0000000..19d53b9 --- /dev/null +++ b/src/Pandas.NET/Series/Series.q1.cs @@ -0,0 +1,18 @@ +using System; + +namespace PandasNet; + +public partial class Series +{ + public double q1() + { + double p = 0.25; + return _data switch + { + int[] data => Quantile(p), + float[] data => Quantile(p), + double[] data => Quantile(p), + _ => throw new NotImplementedException("") + }; + } +} \ No newline at end of file diff --git a/src/Pandas.NET/Series/Series.q2.cs b/src/Pandas.NET/Series/Series.q2.cs new file mode 100644 index 0000000..016fa6f --- /dev/null +++ b/src/Pandas.NET/Series/Series.q2.cs @@ -0,0 +1,18 @@ +using System; + +namespace PandasNet; + +public partial class 
Series +{ + public double q2() + { + double p = 0.5; + return _data switch + { + int[] data => Quantile(p), + float[] data => Quantile(p), + double[] data => Quantile(p), + _ => throw new NotImplementedException("") + }; + } +} \ No newline at end of file diff --git a/src/Pandas.NET/Series/Series.q3.cs b/src/Pandas.NET/Series/Series.q3.cs new file mode 100644 index 0000000..3b526cd --- /dev/null +++ b/src/Pandas.NET/Series/Series.q3.cs @@ -0,0 +1,18 @@ +using System; + +namespace PandasNet; + +public partial class Series +{ + public double q3() + { + double p = 0.75; + return _data switch + { + int[] data => Quantile(p), + float[] data => Quantile(p), + double[] data => Quantile(p), + _ => throw new NotImplementedException("") + }; + } +} \ No newline at end of file diff --git a/src/Pandas.NET/Series/Series.to_numpy.cs b/src/Pandas.NET/Series/Series.to_numpy.cs new file mode 100644 index 0000000..d74fd0f --- /dev/null +++ b/src/Pandas.NET/Series/Series.to_numpy.cs @@ -0,0 +1,19 @@ +using System; +using Tensorflow.NumPy; + +namespace PandasNet; + +public partial class Series +{ + public NDArray to_numpy() + { + return _data switch + { + bool[] => np.array(array()), + int[] => np.array(array()), + float[] => np.array(array()), + double[] => np.array(array()), + _ => throw new NotImplementedException("") + }; + } +} diff --git a/src/Pandas.NET/SeriesBase.cs b/src/Pandas.NET/SeriesBase.cs deleted file mode 100644 index e2d14cb..0000000 --- a/src/Pandas.NET/SeriesBase.cs +++ /dev/null @@ -1,129 +0,0 @@ -using NumSharp.Core; -using PandasNet.Impl; -using PandasNet.Iteration; -using System; -using System.Collections.Generic; -using System.Text; - -namespace PandasNet -{ - public abstract class SeriesBase : PandasObject, IPandasObject, IRowIndexable - { - public IDataIndex Index { get; set; } - - public IDataFrame this[Slice s] => throw new NotImplementedException(); - - - - /// - /// 转换为指定的dtype - /// - /// - /// - /// - public abstract SeriesBase AsType(bool copy = 
true); - - public abstract object this[int index] { get; set; } - - public abstract object this[string idx] { get; } - - public static NDArray operator +(SeriesBase a, SeriesBase b) - { - if (a.Shape != b.Shape) - { - throw new Exception("相加的Series长度Shape不相等"); - } - NDArray nd = new NDArray(typeof(object), a.Shape); - - for (var i = 0; i < a.Size; i++) - { - if (a[i] is string || b[i] is string) - { - nd[i] = a[i].ToString() + b[i].ToString(); - } - else - { - try - { - nd[i] = Convert.ToDecimal(a[i]) + Convert.ToDecimal(b[i]); - } - catch (InvalidCastException) - { - nd[i] = null; - } - } - } - return nd; - } - - public static NDArray operator -(SeriesBase a, SeriesBase b) - { - if (a.Shape != b.Shape) - { - throw new Exception("相加的Series长度Shape不相等"); - } - NDArray nd = new NDArray(typeof(object), a.Shape); - for (var i = 0; i < a.Size; i++) - { - if (a[i] is string || b[i] is string) - { - nd[i] = null; - } - else - { - try - { - nd[i] = Convert.ToDecimal(a[i]) - Convert.ToDecimal(b[i]); - } - catch (InvalidCastException) - { - nd[i] = null; - } - } - } - return nd; - } - - public static NDArray operator *(SeriesBase a, SeriesBase b) - { - if (a.Shape != b.Shape) - { - throw new Exception("相加的Series长度Shape不相等"); - } - NDArray nd = new NDArray(typeof(object), a.Shape); - for (var i = 0; i < a.Size; i++) - { - try - { - nd[i] = Convert.ToDecimal(a[i]) * Convert.ToDecimal(b[i]); - } - catch (InvalidCastException) - { - nd[i] = null; - } - } - return nd; - } - - public static NDArray operator /(SeriesBase a, SeriesBase b) - { - if (a.Shape != b.Shape) - { - throw new Exception("相加的Series长度Shape不相等"); - } - NDArray nd = new NDArray(typeof(object), a.Shape); - for (var i = 0; i < a.Size; i++) - { - try - { - nd[i] = Convert.ToDecimal(a[i]) / Convert.ToDecimal(b[i]); - } - catch (DivideByZeroException) - { - nd[i] = null; - } - } - return nd; - } - } -} diff --git a/src/Pandas.NET/Slice.cs b/src/Pandas.NET/Slice.cs deleted file mode 100644 index 6f15852..0000000 --- 
a/src/Pandas.NET/Slice.cs +++ /dev/null @@ -1,54 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; -using System.Text.RegularExpressions; - -namespace PandasNet -{ - public class Slice - { - public int Start { private set; get; } - - public int End { private set; get; } - - public int Step { private set; get; } - - /// - /// - /// - /// - public static implicit operator Slice(string sliceRule) - { - return new Slice(sliceRule); - } - - Slice(string sliceRule) - { - if (string.IsNullOrEmpty(sliceRule)) - { - throw new ArgumentNullException("the sliceRule is null or empty"); - } - Regex regex = new Regex(@"(?\d*):(?\d*):?(?\d*)"); - Match match = regex.Match(sliceRule); - - Start = 0; - End = int.MaxValue; - Step = 1; - if (match.Success) - { - var start = match.Groups["start"].Value; - var end = match.Groups["end"].Value; - var step = match.Groups["step"].Value; - - Start = !string.IsNullOrEmpty(start) ? Convert.ToInt32(start) : Start; - End = !string.IsNullOrEmpty(end) ? Convert.ToInt32(end) : End; - Step = !string.IsNullOrEmpty(step) ? 
Convert.ToInt32(step) : Step; - } - else - { - throw new ArgumentException("the sliceRule is not correct format"); - } - - } - } -} diff --git a/src/Pandas.NET/Utils/Web.cs b/src/Pandas.NET/Utils/Web.cs new file mode 100644 index 0000000..f32cfca --- /dev/null +++ b/src/Pandas.NET/Utils/Web.cs @@ -0,0 +1,40 @@ +using System; +using System.IO; +using System.Linq; +using System.Net; +using System.Threading; +using System.Threading.Tasks; + +namespace PandasNet.Utils; + +public class Web +{ + public static bool Download(string url, string destDir, string destFileName) + { + if (destFileName == null) + destFileName = url.Split(Path.DirectorySeparatorChar).Last(); + + Directory.CreateDirectory(destDir); + + string relativeFilePath = Path.Combine(destDir, destFileName); + + if (File.Exists(relativeFilePath)) + { + Console.WriteLine($"{relativeFilePath} already exists."); + return false; + } + + var wc = new WebClient(); + Console.WriteLine($"Downloading from {url}"); + var download = Task.Run(() => wc.DownloadFile(url, relativeFilePath)); + while (!download.IsCompleted) + { + Thread.Sleep(1000); + Console.WriteLine("."); + } + Console.WriteLine(""); + Console.WriteLine($"Downloaded to {relativeFilePath}"); + + return true; + } +} diff --git a/test/Pandas.NET.Test/DataFrameTest.cs b/test/Pandas.NET.Test/DataFrameTest.cs deleted file mode 100644 index 11db14e..0000000 --- a/test/Pandas.NET.Test/DataFrameTest.cs +++ /dev/null @@ -1,122 +0,0 @@ -using NumSharp.Core; -using PandasNet.Impl; -using System; -using System.Collections.Generic; -using System.Text; -using Xunit; - -namespace PandasNet.Test -{ - public class DataFrameTest - { - [Fact] - public void Create_WithNDArray_Test() - { - NDArray array = np.arange(10); - array.reshape(5, 2); - var pd = new Pandas(); - IDataFrame df1 = new DataFrame(array, null, null, typeof(object)); - var one = df1[0]; - Assert.Equal(0, (one as SeriesBase).Name); - Assert.Equal(4, (one as SeriesBase)[2]); - var oneAndTwo = df1[0,1]; - var s 
= oneAndTwo.iloc[4]; - Assert.Equal(8, s[0]); - Assert.Equal(9, s[1]); - } - - [Fact] - public void Create_WithDict_Test() - { - var dict = new Dictionary - { - { "one", np.arange(10000) }, - { "two", np.arange(10001, 20001) } - }; - var pd = new Pandas(); - IDataFrame df1 = new DataFrame(dict); - var one = df1["one"]; - Assert.Equal("one", (one as SeriesBase).Name); - Assert.Equal(2, (one as SeriesBase)[2]); - var oneAndTwo = df1["one", "two"]; - - var s = oneAndTwo.iloc[9999]; - Assert.Equal(9999, s[0]); - Assert.Equal(9999, s["one"]); - Assert.Equal(20000, s[1]); - Assert.Equal(20000, s["two"]); - } - - [Fact] - public void SetColumn_Test() - { - var dict = new Dictionary - { - { "one", np.arange(1000) }, - { "two", np.arange(1001, 2001) } - }; - var pd = new Pandas(); - var df1 = pd.DataFrame(dict); - df1["three"] = new Series(np.arange(2001, 3001)); - Assert.Equal(3000, df1.Size); - df1.SingleColumn("four", 1); - Assert.Equal(1, df1["four"][500]); - Assert.Equal(4000, df1.Size); - - df1.SingleColumn(1, 1); - Assert.Equal(1, df1["two"][500]); - df1.Column("five", np.arange(3001, 4001)); - Assert.Equal(3001, df1["five"][0]); - - df1.SingleColumn(4, 1); - Assert.Equal(1, df1["five"][0]); - - Assert.Equal(3000, df1["three"][999]); - df1["three"] = new Series(np.arange(1000)); - Assert.Equal(999, df1["three"][999]); - - } - - [Fact] - public void Read_iloc_Test() - { - var dict = new Dictionary - { - { "one", np.arange(1000) }, - { "two", np.arange(1001, 2001) } - }; - var pd = new Pandas(); - IDataFrame df1 = new DataFrame(dict); - Assert.Equal(2, df1.iloc[2].Name); - Assert.Equal(2, df1.iloc[2]["one"]); - } - - [Fact] - public void Slice_Row_Test() - { - var dict = new Dictionary - { - { "one", np.arange(100) }, - { "two", np.arange(101, 201) } - }; - var pd = new Pandas(); - IDataFrame df1 = new DataFrame(dict); - var dfSl1 = df1[(Slice)"0:12:5"]; - var one = dfSl1["one"]; - Assert.Equal("one", (one as SeriesBase).Name); - Assert.Equal(10, (one as 
SeriesBase)[2]); - var dfSl2 = df1[(Slice)"0:10"]; - var two = dfSl2["two"]; - Assert.Equal("two", (two as SeriesBase).Name); - Assert.Equal(110, (two as SeriesBase)[9]); - var dfSl3 = df1[(Slice)"6:10:3"]; - var two2 = dfSl3["two"]; - Assert.Equal("two", (two2 as SeriesBase).Name); - Assert.Equal(110, (two2 as SeriesBase)[1]); - var dfSl4 = df1[(Slice)":"]; - var one2 = dfSl4["one"]; - Assert.Equal("one", (one2 as SeriesBase).Name); - Assert.Equal(0, (one2 as SeriesBase)[0]); - } - } -} diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.copy.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.copy.cs new file mode 100644 index 0000000..6097aba --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.copy.cs @@ -0,0 +1,32 @@ +using Tensorflow; +using static PandasNet.PandasApi; + +namespace Pandas.Test +{ + public class DataFrameCopyTest + { + [Fact] + public void TestCopy() + { + // Arrange + var df = pd.DataFrame.from_dict("{'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}"); + + // Act + var copiedDf = df.copy(); + + // Assert + // Verify that the copied DataFrame is not the same instance as the original DataFrame + Assert.NotSame(df, copiedDf); + + // Verify that the columns weren't just copied by reference + Assert.NotSame(df.columns[0], copiedDf.columns[0]); + + // Verify that the copied DataFrame has the same data as the original DataFrame + // same count of columns + Assert.Equal(df.columns.Count, copiedDf.columns.Count); + + // same count of rows + Assert.Equal(df[0].data.Count, copiedDf[0].data.Count); + } + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.describe.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.describe.cs new file mode 100644 index 0000000..7ff056c --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.describe.cs @@ -0,0 +1,66 @@ +using System.Collections.Generic; +using System.Security.Cryptography; +using System.Text.Json; +using 
System.Text.Json.Nodes; +using PandasNet; +using static PandasNet.PandasApi; + +namespace Pandas.Test +{ + public class DataFrameDescribeTest + { + [Fact] + public void TestDescribe1() + { + var df = pd.DataFrame.from_dict(JsonSerializer.Serialize(new Dictionary + { + {"col_1",new int[] { 1, 2, 3 }}, + {"col_2", new int[] { 4, 5, 6 }}, + {"col_3 ", new int[] { 7, 8, 9 }} + })); + + // Act + var result = df.describe(); + + // Assert + Assert.Equal(3, result.columns.Count); // Expecting 3 statistical columns + Assert.Equal(8, result.shape[0]); // Expecting 8 statistical rows + + // Assert specific values for "count", "mean", "std", "min", "25%", "50%", "75%", "max" + Assert.Equal(3, result.data[0].GetValue(0)); // count + Assert.Equal(2, result.data[0].GetValue(1)); // mean + Assert.Equal(1, result.data[0].GetValue(2)); // std + Assert.Equal(1, result.data[0].GetValue(3)); // min + Assert.Equal(1.5, result.data[0].GetValue(4)); // 25% + Assert.Equal(2, result.data[0].GetValue(5)); // 50% + Assert.Equal(2.5, result.data[0].GetValue(6)); // 75% + Assert.Equal(3, result.data[0].GetValue(7)); // max + } + + [Fact] + public void TestDescribe2() + { + var df = pd.DataFrame.from_dict(JsonSerializer.Serialize(new Dictionary + { + {"col_1",new int[] { 1, 2, 3, 4, 5 }} + })); + + // Act + var result = df.describe(); + + // Assert + Assert.Single(result.columns); // Expecting 1 statistical columns + Assert.Equal(8, result.shape[0]); // Expecting 8 statistical rows + + // Assert specific values for "count", "mean", "std", "min", "25%", "50%", "75%", "max" + Assert.Equal(5, result.data[0].GetValue(0)); // count + Assert.Equal(3, result.data[0].GetValue(1)); // mean + Assert.Equal(1.5811388300841898, result.data[0].GetValue(2)); // std + Assert.Equal(1, result.data[0].GetValue(3)); // min + Assert.Equal(2, result.data[0].GetValue(4)); // 25% + Assert.Equal(3, result.data[0].GetValue(5)); // 50% + Assert.Equal(4, result.data[0].GetValue(6)); // 75% + Assert.Equal(5, 
result.data[0].GetValue(7)); // max + } + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.drop.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.drop.cs new file mode 100644 index 0000000..44d1032 --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.drop.cs @@ -0,0 +1,32 @@ +using Xunit; +using static PandasNet.PandasApi; + +namespace Pandas.Test +{ + public class DataFrameDropTest + { + [Fact] + public void TestDrop() + { + // Arrange + var df = pd.DataFrame.from_dict("{'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}"); + var indexToDrop = new[] { 1, 3 }; + + // Act + var result = df.drop(indexToDrop); + + // Assert + // Verify that the dropped rows are not present in the result + Assert.DoesNotContain(2, result["col_1"].array()); + Assert.DoesNotContain(0, result["col_1"].array()); + Assert.DoesNotContain("b", result["col_2"].array()); + Assert.DoesNotContain("d", result["col_2"].array()); + + // Verify that the remaining rows are present in the result + Assert.Contains(3, result["col_1"].array()); + Assert.Contains(1, result["col_1"].array()); + Assert.Contains("a", result["col_2"].array()); + Assert.Contains("c", result["col_2"].array()); + } + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.dropna.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.dropna.cs new file mode 100644 index 0000000..22e6239 --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.dropna.cs @@ -0,0 +1,30 @@ +using System; +using PandasNet; +using Xunit; +using static PandasNet.PandasApi; + +namespace Pandas.Test +{ + public class DataFrameDropnaTest + { + [Fact] + public void TestDropna() + { + // Arrange + var df = pd.DataFrame.from_dict("{'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}"); + Assert.Equal(4, df.shape[0]); + Assert.Equal(2, df.shape[1]); + df["col_1"].SetNull(1); + df["col_2"].SetNull(2); + + // Act + var result = df.dropna(); + + // Assert + 
Assert.Equal(2, result.shape[0]); //rows + Assert.Equal(2, result.shape[1]); //columns + Assert.Equal(new int[] { 3, 0 }, result["col_1"].array()); + Assert.Equal(new string[] { "a", "d" }, result["col_2"].array()); + } + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.head.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.head.cs new file mode 100644 index 0000000..c788257 --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.head.cs @@ -0,0 +1,49 @@ +using static PandasNet.PandasApi; +using Tensorflow; +using System.Linq; + +namespace Pandas.Test.DataFrames +{ + public class DataFrameHeadTest + { + [Fact] + public void TestHead() + { + // Arrange + var df = pd.DataFrame.from_dict("{'col_1': [5,4,3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd','e','f']}"); + var expected = df[new Slice(0, 5, 1)]; + + // Act + var actual = df.head(); + + // Assert + Assert.True(expected == actual); + } + + [Fact] + public void TestHead_ReturnsCorrectNumberOfRows() + { + // Arrange + var df = pd.DataFrame.from_dict("{'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']}"); + + // Act + var result = df.head(3); + + // Assert + Assert.Equal(3, result.shape[0]); + } + + [Fact] + public void TestHead_ReturnsCorrectColumns() + { + // Arrange + var df = pd.DataFrame.from_dict("{'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']}"); + + // Act + var result = df.head(3); + + // Assert + Assert.Equal(new[] { "col_1", "col_2" }, result.columns.Select(c => c.Name).ToArray()); + } + } +} diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.implicit.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.implicit.cs new file mode 100644 index 0000000..4e62b5a --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.implicit.cs @@ -0,0 +1,36 @@ +using System.Linq; +using Tensorflow.NumPy; +using static PandasNet.PandasApi; + +namespace Pandas.Test +{ + public class DataFrameImplicitTest + { + /// + /// Test 
implicit conversion of DataFrame to NDArray. + /// + /// See related StackOverflow answer if you experience issues with TensorFlow DLL. + /// > + /// + [Fact] + public void ImplicitConversionToNDArray() + { + // Arrange + var df = pd.DataFrame.from_dict("{'col_1': [5, 4, 3, 2, 1, 0]}"); + + // Act + + NDArray result = df; + + // Assert + Assert.NotNull(result); + + // Add more assertions as needed + Assert.Equal(6, result.shape[0]); + Assert.Equal(1, result.shape[1]); + + // ensure they are the same element-wise + Assert.True(df.data[0].array().SequenceEqual([.. result[":", 0]])); + } + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.index.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.index.cs new file mode 100644 index 0000000..6fa2a9d --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.index.cs @@ -0,0 +1,83 @@ +using System.Linq; +using Tensorflow.NumPy; +using static PandasNet.PandasApi; +using Tensorflow; +using PandasNet; +using System.Collections.Generic; +using System.Text.Json; + +namespace Pandas.Test +{ + public class DataFrameIndexTest + { + [Fact] + public void IndexTest1_Slice() + { + //Arrange + var df = pd.DataFrame.from_dict("{'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']}"); + + //Act + var slicedDf = df[new Slice(1, 4, 1)]; + + //Assert + Assert.Equal(3, slicedDf.shape[0]); + Assert.Equal(2, slicedDf.shape[1]); + + Assert.Equal(2, slicedDf["col_1"].GetValue(0)); + Assert.Equal("c", slicedDf["col_2"].GetValue(1)); + } + + [Fact] + public void TestIndexer() + { + // Arrange + var df = pd.DataFrame.from_dict("{'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}"); + + + // Act + var indexedInt = df[0, "col_1"]; + var indexedString = df[1, "col_2"]; + + // Assert + Assert.Equal(df["col_1"].GetValue(0), (int)indexedInt); + Assert.Equal(df["col_2"].GetValue(1), (string)indexedString); + } + + [Fact] + public void TestReturnsCorrectSeries() + { + // Arrange + var df = 
pd.DataFrame.from_dict("{'col_1': [3, 2, 1, 0]}"); + + // Act + Series col2 = new Series(new int[] { 1, 2, 3, 4 }, new Column { Name = "col_2", DType = typeof(int) }); + df.data.Add(col2); + + // Assert + Assert.Equal(col2, df["col_2"]); + Assert.True(col2.array().SequenceEqual(df["col_2"].array())); + } + + [Fact] + public void TestMultiColumnIndexer() + { + // Arrange + var df = pd.DataFrame.from_dict("{'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd'], 'col_3': [1, 2, 3, 4]}"); + + Assert.Equal(3, df.shape[1]); + + // Act + var indexedDf = df["col_1", "col_2"]; + + // Assert + + + Assert.Equal(2, indexedDf.shape[1]); + Assert.Equal(4, indexedDf.shape[0]); + + Assert.Equal(3, indexedDf["col_1"].GetValue(0)); + Assert.Equal("b", indexedDf["col_2"].GetValue(1)); + } + + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.math.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.math.cs new file mode 100644 index 0000000..684745d --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.math.cs @@ -0,0 +1,52 @@ +using System.Linq; +using Tensorflow.NumPy; +using static PandasNet.PandasApi; +using Tensorflow; +using PandasNet; +using System.Collections.Generic; +using System.Text.Json; + +namespace Pandas.Test +{ + public class DataFrameMathTest + { + [Fact] + public void TestMean() + { + // Arrange + var dfOdd = pd.DataFrame.from_dict("{'col_1': [1, 2, 3, 4, 5], 'col_2': [6, 7, 8, 9, 10]}"); + var dfEven = pd.DataFrame.from_dict("{'col_1': [1, 2, 3, 4], 'col_2': [5, 6, 7, 8]}"); + + // Act + Series meanOdd = dfOdd.mean(); + Series meanEven = dfEven.mean(); + + // Assert + Assert.Equal(3.0D, (double)meanOdd["col_1"]); + Assert.Equal(8.0D, (double)meanOdd["col_2"]); + + Assert.Equal(2.5D, (double)meanEven["col_1"]); + Assert.Equal(6.5D, (double)meanEven["col_2"]); + } + + [Fact] + public void TestStd() + { + // Arrange + List data = new List + { + new Series(new double[] { 1, 2, 3, 4, 5 }, new Column { Name = 
"column1", DType = typeof(double) }), + new Series(new double[] { 2, 4, 6, 8, 10 }, new Column { Name = "column2", DType = typeof(double) }) + }; + var dataFrame = new DataFrame(data); + + // Act + Series result = dataFrame.std(); + + // Assert + // Expected values are calculated using numpy.std + Assert.Equal(1.5811388300841898, result["column1"]); + Assert.Equal(3.1622776601683795, result["column2"]); + } + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.operator.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.operator.cs new file mode 100644 index 0000000..04e309d --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.operator.cs @@ -0,0 +1,145 @@ +using System; +using System.Collections.Generic; +using HDF5CSharp; +using PandasNet; + + +namespace Pandas.Test; + +public class DataFrameOperatorTests +{ + [Fact] + public void TestSubtractionOperator() + { + // Arrange + List data = new List + { + new Series(new double[] { 1, 2, 3, 4, 5 }, new Column { Name = "column1", DType = typeof(double) }), + new Series(new double[] { 2, 4, 6, 8, 10 }, new Column { Name = "column2", DType = typeof(double) }) + }; + var df = new DataFrame(data); + + var series = new Series(new double[] { 0.5, 1.5 }, new Series(new string[]{"column1","column2"}), new Column("sub", typeof(double))); + + // Act + var result = df - series; + + // Assert + Assert.Equal(new double[] { 0.5, 1.5, 2.5, 3.5, 4.5 }, result["column1"].data as double[]); + Assert.Equal(new double[] { 0.5, 2.5, 4.5, 6.5, 8.5}, result["column2"].data as double[]); + } + + [Fact] + public void TestDivisionOperator() + { + List data = new List + { + new Series(new double[] { 5, 10, 15, 20, 25 }, new Column { Name = "column1", DType = typeof(double) }), + new Series(new double[] { 2, 4, 6, 8, 10 }, new Column { Name = "column2", DType = typeof(double) }) + }; + var df = new DataFrame(data); + + var series = new Series(new double[] { 5, 2 }); + + // Act + var result = 
df / series; + + // Assert + Assert.Equal(new double[] { 1, 2, 3, 4, 5 }, result["column1"].data as double[]); + Assert.Equal(new double[] { 1, 2, 3, 4, 5 }, result["column2"].data as double[]); + } + + [Fact] + public void TestMultiplicationOperator() + { + List data = new List + { + new Series(new double[] { 5, 10, 15, 20, 25 }, new Column { Name = "column1", DType = typeof(double) }), + new Series(new double[] { 2, 4, 6, 8, 10 }, new Column { Name = "column2", DType = typeof(double) }) + }; + var df = new DataFrame(data); + + var series = new Series(new double[] { 1, 10 }); + + // Act + var result = df * series; + + // Assert + Assert.Equal(new double[] { 5, 10, 15, 20, 25 }, result["column1"].data as double[]); + Assert.Equal(new double[] { 20, 40, 60, 80, 100 }, result["column2"].data as double[]); + } + + [Fact] + public void TestEqualityOperator() + { + // Arrange + List data = new List + { + new Series(new float[] { 1, 2, 3, 4, 5 }, new Column { Name = "column1", DType = typeof(float) }), + new Series(new float[] { 6, 7, 8, 9, 10 }, new Column { Name = "column2", DType = typeof(float) }) + }; + var df = new DataFrame(data); + var df2 = new DataFrame(data); + + // Arrange: DataFrame with different column name + List badColumnData = new List + { + new Series(new float[] { 1, 2, 3, 4, 5 }, new Column { Name = "Column1", DType = typeof(float) }), + new Series(new float[] { 6, 7, 8, 9, 10 }, new Column { Name = "column2", DType = typeof(float) }) + }; + var badColumnDf = new DataFrame(badColumnData); + // Arrange: DataFrame with different data + List badData = new List + { + new Series(new float[] { 1, 2, 3, 4, 5 }, new Column { Name = "column1", DType = typeof(float) }), + new Series(new float[] { 6, 7, 8, 9, 11 }, new Column { Name = "column2", DType = typeof(float) }) + }; + + // Act + bool areEqual = df == df2; + bool badColumnInequal = df == badColumnDf; + bool badDataInequal = df == new DataFrame(badData); + + // Assert + Assert.True(areEqual); + 
Assert.False(badColumnInequal); + Assert.False(badDataInequal); + } + + [Fact] + public void TestInEqualityOperator() + { + // Arrange + List data = new List + { + new Series(new float[] { 1, 2, 3, 4, 5 }, new Column { Name = "column1", DType = typeof(float) }), + new Series(new float[] { 6, 7, 8, 9, 10 }, new Column { Name = "column2", DType = typeof(float) }) + }; + var df = new DataFrame(data); + var df2 = new DataFrame(data); + + // Arrange: DataFrame with different column name + List badColumnData = new List + { + new Series(new float[] { 1, 2, 3, 4, 5 }, new Column { Name = "Column1", DType = typeof(float) }), + new Series(new float[] { 6, 7, 8, 9, 10 }, new Column { Name = "column2", DType = typeof(float) }) + }; + var badColumnDf = new DataFrame(badColumnData); + // Arrange: DataFrame with different data + List badData = new List + { + new Series(new float[] { 1, 2, 3, 4, 5 }, new Column { Name = "column1", DType = typeof(float) }), + new Series(new float[] { 6, 7, 8, 9, 11 }, new Column { Name = "column2", DType = typeof(float) }) + }; + + // Act + bool areEqual = df != df2; + bool badColumnInequal = df != badColumnDf; + bool badDataInequal = df != new DataFrame(badData); + + // Assert + Assert.False(areEqual); + Assert.True(badColumnInequal); + Assert.True(badDataInequal); + } +} diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.pop.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.pop.cs new file mode 100644 index 0000000..34095fc --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.pop.cs @@ -0,0 +1,28 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using PandasNet; +using Tensorflow; + +namespace Pandas.Test; + +public class DataFramePopTest +{ + [Fact] + public void TestPopMethod() + { + // Arrange + var dataFrameData = new List(); + dataFrameData.Add(new Series(new float[] { 1, 2, 3, 4, 5 }, new Column("column1", typeof(float)))); + dataFrameData.Add(new Series(new float[] { 6, 7, 8, 9, 10 }, new 
Column("column2", typeof(float)))); + var dataFrame = new DataFrame(dataFrameData); + + // Act + var poppedSeries = dataFrame.pop("column1"); + + // Assert + Assert.True(poppedSeries.array().SequenceEqual([1, 2, 3, 4, 5])); + Assert.False(dataFrame.columns.Where(c => c.Name == "column1").Any()); + Assert.True(dataFrame.columns.Where(c => c.Name == "column2").Any()); + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/DataFrames/DataFrame.test.sample.cs b/test/Pandas.NET.Test/DataFrames/DataFrame.test.sample.cs new file mode 100644 index 0000000..c4de814 --- /dev/null +++ b/test/Pandas.NET.Test/DataFrames/DataFrame.test.sample.cs @@ -0,0 +1,73 @@ +using PandasNet; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Pandas.Test; +public class DataFrameSampleTests +{ + [Fact] + public void TestSampleMethod() + { + // Arrange + List data = new List + { + new Series(new double[] { 1, 2, 3, 4, 5 }, new Column { Name = "column1", DType = typeof(double) }), + new Series(new double[] {6, 7, 8, 9, 10 }, new Column { Name = "column2", DType = typeof(double) }) + }; + var dataFrame = new DataFrame(data); + + // Act + var sampledDataFrame = dataFrame.sample(n: 3, random_state: 1); + + // Assert + Assert.Equal(3, sampledDataFrame.index.size); + Assert.True(sampledDataFrame.columns.Where(x => x.Name == "column1").Any()); + Assert.True(sampledDataFrame.columns.Where(x => x.Name == "column2").Any()); + } + + [Fact] + public void TestSampleMethodWithFrac() + { + // Arrange + List data = new List + { + new Series(new double[] { 1, 2, 3, 4, 5 }, new Column { Name = "column1", DType = typeof(double) }), + new Series(new double[] {6, 7, 8, 9, 10 }, new Column { Name = "column2", DType = typeof(double) }) + }; + var dataFrame = new DataFrame(data); + + // Act + var sampledDataFrame = dataFrame.sample(frac: 0.4f, random_state: 1); + var sampledDataFrame2 = dataFrame.sample(frac: 0.4f, random_state: 2); + + // Assert + Assert.Equal(2, 
sampledDataFrame.index.size); // 40% of 5 is 2 + Assert.True(sampledDataFrame.columns.Where(x => x.Name == "column1").Any()); + Assert.True(sampledDataFrame.columns.Where(x => x.Name == "column2").Any()); + Assert.Equal(2.0D, sampledDataFrame["column1"].GetValue(0)); + Assert.Equal(7.0D, sampledDataFrame["column2"].GetValue(0)); + + // assert for other state + Assert.Equal(2, sampledDataFrame2.index.size); // 40% of 5 is 2 + Assert.Equal(5.0D, sampledDataFrame2["column1"].GetValue(0)); + Assert.Equal(10.0D, sampledDataFrame2["column2"].GetValue(0)); + } + + [Fact] + public void TestSampleMethodThrowsException() + { + // Arrange + List data = new List + { + new Series(new double[] { 1, 2, 3, 4, 5 }, new Column { Name = "column1", DType = typeof(double) }), + new Series(new double[] {6, 7, 8, 9, 10 }, new Column { Name = "column2", DType = typeof(double) }) + }; + var dataFrame = new DataFrame(data); + + // Act & Assert + Assert.Throws(() => dataFrame.sample()); + Assert.Throws(() => dataFrame.sample(n: 3, frac: 0.4f)); + Assert.Throws(() => dataFrame.sample(n: 6)); + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/ExtendedMethods/TypeExtended.test.cs b/test/Pandas.NET.Test/ExtendedMethods/TypeExtended.test.cs new file mode 100644 index 0000000..9827719 --- /dev/null +++ b/test/Pandas.NET.Test/ExtendedMethods/TypeExtended.test.cs @@ -0,0 +1,30 @@ +using System; +using Xunit; + +public class TypeExtendedTests +{ + [Fact] + public void TestIsNumericType() + { + // Arrange + Type intType = typeof(int); + Type nullableIntType = typeof(int?); + Type stringType = typeof(string); + Type doubleType = typeof(double); + Type floatType = typeof(float); + + // Act + bool isIntNumeric = intType.IsNumericType(); + bool isNullableIntNumeric = nullableIntType.IsNumericType(); + bool isStringNumeric = stringType.IsNumericType(); + bool isDoubleNumeric = doubleType.IsNumericType(); + bool isFloatNumeric = floatType.IsNumericType(); + + // Assert + 
Assert.True(isIntNumeric); + Assert.True(isNullableIntNumeric); + Assert.False(isStringNumeric); + Assert.True(isDoubleNumeric); + Assert.True(isFloatNumeric); + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/GroupTest.cs b/test/Pandas.NET.Test/GroupTest.cs deleted file mode 100644 index 8ffec80..0000000 --- a/test/Pandas.NET.Test/GroupTest.cs +++ /dev/null @@ -1,35 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; -using Xunit; -using PandasNet.Impl; -using NumSharp.Core; - -namespace PandasNet.Test -{ - public class GroupTest - { - [Fact] - public void Create_Group_Test() - { - var dict = new Dictionary - { - { "col1",np.arange(5) }, - { "col2",np.arange(5) }, - { "col3",np.arange(5) }, - { "col4",np.arange(5) }, - { "col5",np.arange(5) }, - }; - - var pd = new Pandas(); - var df1 = pd.DataFrame(dict, new string[] { - "row1","row2","row3","row4","row5" - }); - var gro = df1.groupby("col2"); - var group = gro.Groups; - var a = gro.Groups[1]; - Assert.Equal("row2", a.Values[0]); - - } - } -} diff --git a/test/Pandas.NET.Test/IndexSliceTest.cs b/test/Pandas.NET.Test/IndexSliceTest.cs new file mode 100644 index 0000000..8286ba6 --- /dev/null +++ b/test/Pandas.NET.Test/IndexSliceTest.cs @@ -0,0 +1,14 @@ +using static PandasNet.PandasApi; + +namespace Pandas.Test +{ + public class IndexSliceTest + { + [Fact] + public void Test() + { + var df = pd.DataFrame.from_dict("{'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}"); + var df1 = df[new[] { "col_1" }]; + } + } +} diff --git a/test/Pandas.NET.Test/Methods/PandasTest.arange.cs b/test/Pandas.NET.Test/Methods/PandasTest.arange.cs new file mode 100644 index 0000000..1435de9 --- /dev/null +++ b/test/Pandas.NET.Test/Methods/PandasTest.arange.cs @@ -0,0 +1,33 @@ +using System; +using static PandasNet.PandasApi; + +namespace Pandas.Test; + +public partial class PandasTest +{ + + /// + /// Test arange method for int + /// + [Fact] + public void TestArangeInt() + { + //create 
an array of 10 elements from 0 to 9 + var arange = pd.arange(0, 10, 1); + Assert.Equal(10, arange.Length); + for (int i = 0; i < arange.Length; i++) + { + Assert.Equal(i, arange[i]); + } + } + + /// + /// Test arange method for float + /// + [Fact] + public void TestArangeFloat() + { + // Only int is supported + Assert.Throws(() => pd.arange(0, 10, 1)); + } +} \ No newline at end of file diff --git a/test/Pandas.NET.Test/Pandas.NET.Test.csproj b/test/Pandas.NET.Test/Pandas.NET.Test.csproj new file mode 100644 index 0000000..e4a70ac --- /dev/null +++ b/test/Pandas.NET.Test/Pandas.NET.Test.csproj @@ -0,0 +1,22 @@ + + + net8.0 + false + Pandas.Test + + + + + + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + + + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + + \ No newline at end of file diff --git a/test/Pandas.NET.Test/Pandas.Net.Test.csproj b/test/Pandas.NET.Test/Pandas.Net.Test.csproj deleted file mode 100644 index c1d6c32..0000000 --- a/test/Pandas.NET.Test/Pandas.Net.Test.csproj +++ /dev/null @@ -1,22 +0,0 @@ - - - - netcoreapp2.1 - - false - - Pandas.Test - - - - - - - - - - - - - - diff --git a/test/Pandas.NET.Test/SeriesTest.cs b/test/Pandas.NET.Test/SeriesTest.cs deleted file mode 100644 index 18be004..0000000 --- a/test/Pandas.NET.Test/SeriesTest.cs +++ /dev/null @@ -1,73 +0,0 @@ -using PandasNet; -using System; -using Xunit; - -namespace PandasNet.Test -{ - public class SeriesTest - { - private Pandas pd; - public SeriesTest() - { - pd = new Pandas(); - } - - [Fact] - public void CreateSeries_WithArray_Test() - { - var series = pd.Series(new int[] { 1, 2, 3 }); - Assert.Equal(3, series.Size); - Assert.Equal(3, series[2]); - } - - [Fact] - public void CreateSeries_WithObject_Test() - { - var series = pd.Series(new { a = 1, b = "2" }); - Assert.Equal(2, series.Size); - Assert.Equal(1, series["a"]); - Assert.Equal("2", series["b"]); - } - - [Fact] - public void Addition_Test() - { - var series1 = 
pd.Series(new { a = 1, b = "2" }); - var series2 = pd.Series(new { a = 2, b = 2 }); - var nd = series1 + series2; - Assert.Equal(3m, nd[0]); - Assert.Equal("22", nd[1]); - } - - [Fact] - public void Subtraction() - { - var series1 = pd.Series(new { a = 1, b = "2" }); - var series2 = pd.Series(new { a = 2, b = 2 }); - var nd = series1 - series2; - Assert.Equal(-1m, nd[0]); - Assert.Null(nd[1]); - } - - [Fact] - public void Multiplication_Test() - { - var series1 = pd.Series(new { a = 1, b = "2" }); - var series2 = pd.Series(new { a = 2, b = 2 }); - var nd = series1 * series2; - Assert.Equal(2m, nd[0]); - Assert.Equal(4m, nd[1]); - } - - [Fact] - public void Division_Test() - { - var series1 = pd.Series(new { a = 1, b = "2", c = 1 }); - var series2 = pd.Series(new { a = 2, b = 2, c = 0 }); - var nd = series1 / series2; - Assert.Equal(0.5m, nd[0]); - Assert.Equal(1m, nd[1]); - Assert.Null(nd[2]); - } - } -} diff --git a/test/Pandas.NET.Test/SliceTest.cs b/test/Pandas.NET.Test/SliceTest.cs deleted file mode 100644 index ef96b90..0000000 --- a/test/Pandas.NET.Test/SliceTest.cs +++ /dev/null @@ -1,53 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; -using Xunit; -using PandasNet.Impl; - -namespace PandasNet.Test -{ - public class SliceTest - { - [Fact] - public void Create_implicit() - { - Slice slice = "1:2:3"; - Assert.Equal(1, slice.Start); - Assert.Equal(2, slice.End); - Assert.Equal(3, slice.Step); - - slice = ":2:3"; - Assert.Equal(0, slice.Start); - Assert.Equal(2, slice.End); - Assert.Equal(3, slice.Step); - - slice = "::3"; - Assert.Equal(0, slice.Start); - Assert.Equal(int.MaxValue, slice.End); - Assert.Equal(3, slice.Step); - - slice = ":3"; - Assert.Equal(0, slice.Start); - Assert.Equal(3, slice.End); - Assert.Equal(1, slice.Step); - - slice = "1::"; - Assert.Equal(1, slice.Start); - Assert.Equal(int.MaxValue, slice.End); - Assert.Equal(1, slice.Step); - - slice = "1:"; - Assert.Equal(1, slice.Start); - Assert.Equal(int.MaxValue, 
slice.End); - Assert.Equal(1, slice.Step); - - slice = "1:2"; - Assert.Equal(1, slice.Start); - Assert.Equal(2, slice.End); - - slice = ":"; - Assert.Equal(0, slice.Start); - Assert.Equal(int.MaxValue, slice.End); - } - } -} diff --git a/test/Pandas.NET.Test/Usings.cs b/test/Pandas.NET.Test/Usings.cs new file mode 100644 index 0000000..8c927eb --- /dev/null +++ b/test/Pandas.NET.Test/Usings.cs @@ -0,0 +1 @@ +global using Xunit; \ No newline at end of file diff --git a/test/PandasConsole/DataFrames/PandasConsole.describe.cs b/test/PandasConsole/DataFrames/PandasConsole.describe.cs new file mode 100644 index 0000000..c50cfdf --- /dev/null +++ b/test/PandasConsole/DataFrames/PandasConsole.describe.cs @@ -0,0 +1,27 @@ +using System.Collections.Generic; +using System.Text.Json; +using PandasNet; +using static PandasNet.PandasApi; + +namespace PandasConsole.Methods +{ + public class PandasConsoleDescribe + { + public DataFrame GetSampleDataFrame() + { + var df = pd.DataFrame.from_dict(JsonSerializer.Serialize(new Dictionary + { + { "col_1",new int[] { 1, 2, 3, 4, 5 }}, + { "col_2", new int[] { 4, 5, 6, 7, 8 }}, + { "col_3 ", new int[] { 7, 8, 9, 10, 11 }} + })); + return df; + } + + public DataFrame DescribeDataFrame() + { + var df = GetSampleDataFrame(); + return df.describe(); + } + } +} diff --git a/test/PandasConsole/DataFrames/PandasConsole.indexers.cs b/test/PandasConsole/DataFrames/PandasConsole.indexers.cs new file mode 100644 index 0000000..95114db --- /dev/null +++ b/test/PandasConsole/DataFrames/PandasConsole.indexers.cs @@ -0,0 +1,28 @@ +using System.Collections.Generic; +using System.Text.Json; +using PandasNet; +using static PandasNet.PandasApi; + +namespace PandasConsole.Methods +{ + public class PandasConsoleIndexers + { + public DataFrame GetSampleDataFrame() + { + var df = pd.DataFrame.from_dict(JsonSerializer.Serialize(new Dictionary + { + { "col_1",new int[] { 1, 2, 3, 4, 5 }}, + { "col_2", new int[] { 4, 5, 6, 7, 8 }}, + { "col_3 ", new int[] { 7, 8, 
9, 10, 11 }} + })); + return df; + } + + // index on multiple columns + public (DataFrame, DataFrame) MultiColumnIndexer() + { + var df = GetSampleDataFrame(); + return (df, df["col_1","col_2"]); + } + } +} diff --git a/test/PandasConsole/Methods/PandasConsole.convert.cs b/test/PandasConsole/Methods/PandasConsole.convert.cs new file mode 100644 index 0000000..aabc102 --- /dev/null +++ b/test/PandasConsole/Methods/PandasConsole.convert.cs @@ -0,0 +1,13 @@ +using PandasNet; +using static PandasNet.PandasApi; + +namespace PandasConsole.Methods +{ + public class PandasConsoleConvert + { + public DataFrame GetSampleDataFrame() + { + return pd.DataFrame.from_dict("{'col_1': [3, 2, 1, 0], 'col_2': ['3', '2', '1', '0']}"); + } + } +} diff --git a/test/PandasConsole/PandasConsole.csproj b/test/PandasConsole/PandasConsole.csproj new file mode 100644 index 0000000..4a29496 --- /dev/null +++ b/test/PandasConsole/PandasConsole.csproj @@ -0,0 +1,13 @@ + + + Exe + net8.0 + PandasTest + + + + + + + + \ No newline at end of file diff --git a/test/PandasConsole/Program.cs b/test/PandasConsole/Program.cs new file mode 100644 index 0000000..2367d18 --- /dev/null +++ b/test/PandasConsole/Program.cs @@ -0,0 +1,83 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Data; +using System.Linq; +using CommandLine; +using PandasConsole.Methods; +using Tensorflow; +using static PandasNet.PandasApi; + +namespace PandasConsole +{ + class Program + { + public class Options + { + [Option("info", Required = false, HelpText = "Print the info a DataFrame")] + public bool ConvertSample { get; set; } + + [Option("describe", Required = false, HelpText = "Print the describe a DataFrame")] + public bool DescribeSample { get; set; } + + [Option("multi-column-indexer", Required = false, HelpText = "Print the describe a DataFrame")] + public bool MultiColumnIndexer { get; set; } + } + + + static void Main(string[] args) + { + Parser.Default.ParseArguments(args) + 
.WithParsed(o => + { + if (o.ConvertSample) + { + CommandRunners.RunInfoSample(); + } + if (o.DescribeSample) + { + CommandRunners.RunDescribeSample(); + } + if (o.MultiColumnIndexer) + { + CommandRunners.RunMultiColumnIndexer(); + } + }) + .WithNotParsed(CommandRunners.HandleParseError); + } + + public static class CommandRunners + { + public static void HandleParseError(IEnumerable errs) + { + Console.WriteLine("An Error Occurred"); + } + public static void RunInfoSample() + { + var converter = new PandasConsoleConvert(); + var df = converter.GetSampleDataFrame(); + Utils.PrintDataFrameInfo(df); + } + + public static void RunDescribeSample() + { + var describe = new PandasConsoleDescribe(); + var df = describe.DescribeDataFrame(); + + Console.Write(Utils.RenderDataTable(Utils.DataFrameToTable(df))); + } + + internal static void RunMultiColumnIndexer() + { + var indexers = new PandasConsoleIndexers(); + var dfTuple = indexers.MultiColumnIndexer(); + + Console.WriteLine("Original DataFrame"); + Console.Write(Utils.RenderDataTable(Utils.DataFrameToTable(dfTuple.Item1))); + + Console.WriteLine("Indexed DataFrame"); + Console.Write(Utils.RenderDataTable(Utils.DataFrameToTable(dfTuple.Item2))); + } + } + } +} diff --git a/test/PandasConsole/README.md b/test/PandasConsole/README.md new file mode 100644 index 0000000..880d84a --- /dev/null +++ b/test/PandasConsole/README.md @@ -0,0 +1,19 @@ +# PandasConsole + +## Usage + +### DotNet CLI + +- Build the Pandas.NET solution +- Run the project (from project root) `dotnet run --project .\test\PandasConsole --help` + +## Implemented Options + +### Info + +> Prints a sample information table similar to calling df.info() in Python but instead rendering as markdown + +| # | Column | Non-Null Count | DType | +| --- | ------ | -------------- | ------------- | +| 0 | col_1 | 4 | System.Int32 | +| 1 | col_2 | 4 | System.String | diff --git a/test/PandasConsole/Utils/PandasConsole.utils.cs 
b/test/PandasConsole/Utils/PandasConsole.utils.cs new file mode 100644 index 0000000..896bdde --- /dev/null +++ b/test/PandasConsole/Utils/PandasConsole.utils.cs @@ -0,0 +1,139 @@ +using System; +using System.Text; +using PandasNet; +using System.Linq; +using System.Data; +using System.Collections.Generic; + +namespace PandasConsole; +public static class Utils +{ + public static void PrintDataFrameInfo(DataFrame df) + { + var numericTypes = new Type[] { typeof(int[]), typeof(float[]), typeof(double[]) }; + int indexLength = df.shape[0]; + double minIndex = 0; + double maxIndex = 0; + if (numericTypes.Contains(df.index.GetType())) + { + minIndex = df.index.min(); + maxIndex = df.index.max(); + } + int columnCount = df.columns.Count; + + StringBuilder sb = new(); + sb.AppendLine("**Dataframe Info**"); + if (numericTypes.Contains(df.index.GetType())) + { + sb.AppendLine($"Range Index: {indexLength} entries, {minIndex} to {maxIndex}"); + } + else + { + sb.AppendLine($"Index: {indexLength} entries"); + } + sb.AppendLine($"Data Columns: (total {columnCount} columns)"); + Console.WriteLine(sb.ToString()); + + + DataTable table = new DataTable(); + table.Columns.Add("#", typeof(int)); + table.Columns.Add("Column", typeof(string)); + table.Columns.Add("Non-Null Count", typeof(int)); + table.Columns.Add("DType", typeof(string)); + + for (int i = 0; i < df.columns.Count; i++) + { + table.Rows.Add(i, df.columns[i].Name, df[df.columns[i].Name].count(), df[df.columns[i].Name].dtype ?? 
typeof(String)); + } + Console.Write(RenderDataTable(table)); + } + + public static DataTable DataFrameToTable(DataFrame df) + { + DataTable table = new DataTable(); + table.Columns.Add("#"); + + foreach (var column in df.columns) + { + table.Columns.Add(column.Name, column.DType); + } + + for (var index = 0; index < df.shape[0]; index++) + { + DataRow row = table.NewRow(); + row[0] = df.index.GetValue(index); + for (int i = 0; i < df.columns.Count; i++) + { + row[i + 1] = df[df.columns[i].Name].GetValue(index); + } + table.Rows.Add(row); + } + return table; + } + + /// + /// Renders a DataTable to a markdown table + /// + /// Code from StackOverflow contributor david-liebeherr + /// + /// + public static string RenderDataTable(DataTable table) + { + String GetCellValueAsString(DataRow row, DataColumn column) + { + var cellValue = row[column]; + var cellValueAsString = cellValue is null or DBNull ? "{{null}}" : cellValue.ToString(); + + return cellValueAsString; + } + + var columnWidths = new Dictionary(); + + foreach (DataColumn column in table.Columns) + { + columnWidths.Add(column, column.ColumnName.Length); + } + + foreach (DataRow row in table.Rows) + { + foreach (DataColumn column in table.Columns) + { + columnWidths[column] = Math.Max(columnWidths[column], GetCellValueAsString(row, column).Length); + } + } + + var resultBuilder = new StringBuilder(); + + resultBuilder.Append("| "); + + foreach (DataColumn column in table.Columns) + { + resultBuilder.Append(column.ColumnName.PadRight(columnWidths[column])); + resultBuilder.Append(" | "); + } + + resultBuilder.AppendLine(); + resultBuilder.Append("| "); + foreach (DataColumn column in table.Columns) + { + resultBuilder.Append("-".PadRight(columnWidths[column], '-')); + resultBuilder.Append(" | "); + } + resultBuilder.AppendLine(); + + foreach (DataRow row in table.Rows) + { + resultBuilder.Append("| "); + + foreach (DataColumn column in table.Columns) + { + resultBuilder.Append(GetCellValueAsString(row, 
column).PadRight(columnWidths[column])); + resultBuilder.Append(" | "); + } + + resultBuilder.AppendLine(); + } + + return resultBuilder.ToString(); + } +} \ No newline at end of file