From a67a974bd00eb6b858c5ebe46e801fa1d9b506a5 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Fri, 30 Jan 2026 15:11:11 -0600 Subject: [PATCH 1/7] Add CountDistinct method in GroupBy --- src/Microsoft.Data.Analysis/GroupBy.cs | 54 +++++++++++++++++++ .../DataFrameTests.cs | 38 +++++++++++++ 2 files changed, 92 insertions(+) diff --git a/src/Microsoft.Data.Analysis/GroupBy.cs b/src/Microsoft.Data.Analysis/GroupBy.cs index 357fa80a63..206341e478 100644 --- a/src/Microsoft.Data.Analysis/GroupBy.cs +++ b/src/Microsoft.Data.Analysis/GroupBy.cs @@ -21,6 +21,12 @@ public abstract class GroupBy /// public abstract DataFrame Count(params string[] columnNames); + /// + /// Compute the number of distinct non-null values in each group + /// + /// + public abstract DataFrame CountDistinct(params string[] columnNames); + /// /// Return the first value in each group /// @@ -182,6 +188,54 @@ public override DataFrame Count(params string[] columnNames) return ret; } + public override DataFrame CountDistinct(params string[] columnNames) + { + DataFrame ret = new DataFrame(); + PrimitiveDataFrameColumn empty = new PrimitiveDataFrameColumn("Empty"); + DataFrameColumn firstColumn = _dataFrame.Columns[_groupByColumnIndex].Clone(empty); + ret.Columns.Insert(ret.Columns.Count, firstColumn); + GroupByColumnDelegate groupByColumnDelegate = new GroupByColumnDelegate((long rowIndex, TKey key) => + { + firstColumn.Resize(rowIndex + 1); + firstColumn[rowIndex] = key; + }); + ColumnDelegate columnDelegate = new ColumnDelegate((int columnIndex, long rowIndex, ICollection rowEnumerable, TKey key, bool firstGroup) => + { + if (columnIndex == _groupByColumnIndex) + return; + DataFrameColumn column = _dataFrame.Columns[columnIndex]; + long count = 0; + HashSet seenValues = []; + foreach (long row in rowEnumerable) + { + var rowValue = column[row]; + if (rowValue != null && !seenValues.Contains(rowValue)) + { + seenValues.Add(rowValue); + count++; + } + } + DataFrameColumn retColumn; + if (firstGroup) + { + retColumn = new PrimitiveDataFrameColumn(column.Name); + ret.Columns.Insert(ret.Columns.Count, retColumn); + } + else + { + // Assuming non duplicate column names + retColumn = ret.Columns[column.Name]; + } + retColumn.Resize(rowIndex + 1); + retColumn[rowIndex] = count; + }); + + EnumerateColumnsWithRows(groupByColumnDelegate, columnDelegate, columnNames); + ret.SetTableRowCount(firstColumn.Length); + + return ret; + } + public override DataFrame First(params string[] columnNames) { DataFrame ret = new DataFrame(); diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 2d75caef72..47ff904dce 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -429,6 +429,44 @@ public void TestGroupBy() Assert.Equal(2, firstDecimalColumn.Rows.Count); Assert.Equal((decimal)0, firstDecimalColumn.Columns["Decimal"][0]); Assert.Equal((decimal)1, firstDecimalColumn.Columns["Decimal"][1]); + + var dfWithDuplicates = new DataFrame( + new Int32DataFrameColumn("Group", [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]), + new Int32DataFrameColumn("Int", [1, 2, 3, 4, null, 1, 1, 2, 3, 4]), + new DoubleDataFrameColumn("Double", [1, 2, 3, 4, null, 1, 1, 2, 3, 4]), + new StringDataFrameColumn("String", ["1", "2", "3", "4", null, "1", "1", "2", "3", "4"]), + new DateTimeDataFrameColumn("DateTime", [ + new DateTime(2026, 1, 1, 0, 0, 0), + new DateTime(2026, 1, 1, 0, 0, 1), + new DateTime(2026, 1, 1, 0, 0, 2), + new DateTime(2026, 1, 1, 0, 0, 3), + null, + new DateTime(2026, 1, 1, 0, 0, 0), + new DateTime(2026, 1, 1, 0, 0, 0), + new DateTime(2026, 1, 1, 0, 0, 1), + new DateTime(2026, 1, 1, 0, 0, 2), + new DateTime(2026, 1, 1, 0, 0, 3) + ]) + ); + + DataFrame countDistinct = dfWithDuplicates.GroupBy("Group").CountDistinct(); + Assert.Equal(5, countDistinct.Columns.Count); + Assert.Equal(2, countDistinct.Rows.Count); + + foreach (var columnName in countDistinct.Columns.Select(c => c.Name)) + { + if (columnName == "Group") + { + continue; + } + + var column = (PrimitiveDataFrameColumn)countDistinct[columnName]; + + for (int row = 0; row < countDistinct.Rows.Count; row++) + { + Assert.Equal(4, column[row]); + } + } } [Fact] From 6e27c1eb1fb73c40c9007c04eb175b2402c818ec Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Fri, 30 Jan 2026 16:20:01 -0600 Subject: [PATCH 2/7] Add CountIf implementation to GroupBy --- src/Microsoft.Data.Analysis/GroupBy.cs | 97 +++++++++++++++----------- 1 file changed, 56 insertions(+), 41 deletions(-) diff --git a/src/Microsoft.Data.Analysis/GroupBy.cs b/src/Microsoft.Data.Analysis/GroupBy.cs index 206341e478..e9adc11650 100644 --- a/src/Microsoft.Data.Analysis/GroupBy.cs +++ b/src/Microsoft.Data.Analysis/GroupBy.cs @@ -9,6 +9,24 @@ namespace Microsoft.Data.Analysis { + public record GroupByPredicateInput + { + /// + /// The name of the column that is being aggregated + /// + public string ColumnName { get; set; } + + /// + /// The value from the GroupBy column that this group is grouped on + /// + public object GroupKey { get; set; } + + /// + /// The value of this row within the column that is being aggregated + /// + public object RowValue { get; set; } + } + /// /// A GroupBy class that is typically the result of a DataFrame.GroupBy call. /// It holds information to perform typical aggregation ops on it. @@ -16,20 +34,31 @@ namespace Microsoft.Data.Analysis public abstract class GroupBy { /// - /// Compute the number of non-null values in each group + /// Compute the number of non-null values in each group /// + /// The columns within which to compute the number of non-null values in each group. A default value includes all columns. /// public abstract DataFrame Count(params string[] columnNames); + /// + /// Compute the number of values in each group that match a custom predicate + /// + /// A function that takes in the column name, group key, and row value and returns true to include that row in the group count or false to exclude it. + /// The columns within which to compute the number of values in each group that match the predicate. A default value includes all columns. + /// + public abstract DataFrame CountIf(Func predicate, params string[] columnNames); + /// /// Compute the number of distinct non-null values in each group /// + /// The columns within which to compute the number of distinct non-null values in each group. A default value includes all columns. /// public abstract DataFrame CountDistinct(params string[] columnNames); /// /// Return the first value in each group /// + /// Names of the columns to aggregate /// public abstract DataFrame First(params string[] columnNames); @@ -146,6 +175,11 @@ private void EnumerateColumnsWithRows(GroupByColumnDelegate groupByColumnDelegat } public override DataFrame Count(params string[] columnNames) + { + return CountIf(input => input.RowValue != null, columnNames); + } + + public override DataFrame CountIf(Func predicate, params string[] columnNames) { DataFrame ret = new DataFrame(); PrimitiveDataFrameColumn empty = new PrimitiveDataFrameColumn("Empty"); @@ -162,10 +196,19 @@ public override DataFrame Count(params string[] columnNames) return; DataFrameColumn column = _dataFrame.Columns[columnIndex]; long count = 0; + var groupByPredicateInput = new GroupByPredicateInput + { + ColumnName = column.Name, + GroupKey = firstColumn[rowIndex] + }; foreach (long row in rowEnumerable) { - if (column[row] != null) + groupByPredicateInput.RowValue = column[row]; + + if (predicate(groupByPredicateInput)) + { count++; + } } DataFrameColumn retColumn; if (firstGroup) @@ -190,50 +233,22 @@ public override DataFrame Count(params string[] columnNames) public override DataFrame CountDistinct(params string[] columnNames) { - DataFrame ret = new DataFrame(); - PrimitiveDataFrameColumn empty = new PrimitiveDataFrameColumn("Empty"); - DataFrameColumn firstColumn = _dataFrame.Columns[_groupByColumnIndex].Clone(empty); - ret.Columns.Insert(ret.Columns.Count, firstColumn); - GroupByColumnDelegate groupByColumnDelegate = new GroupByColumnDelegate((long rowIndex, TKey key) => - { - firstColumn.Resize(rowIndex + 1); - firstColumn[rowIndex] = key; - }); - ColumnDelegate columnDelegate = new ColumnDelegate((int columnIndex, long rowIndex, ICollection rowEnumerable, TKey key, bool firstGroup) => - { - if (columnIndex == _groupByColumnIndex) - return; - DataFrameColumn column = _dataFrame.Columns[columnIndex]; - long count = 0; - HashSet seenValues = []; - foreach (long row in rowEnumerable) + HashSet seenValues = []; + + return CountIf( + input => { - var rowValue = column[row]; - if (rowValue != null && !seenValues.Contains(rowValue)) + if (input.RowValue == null || seenValues.Contains(input)) { - seenValues.Add(rowValue); - count++; + return false; } - } - DataFrameColumn retColumn; - if (firstGroup) - { - retColumn = new PrimitiveDataFrameColumn(column.Name); - ret.Columns.Insert(ret.Columns.Count, retColumn); - } - else - { - // Assuming non duplicate column names - retColumn = ret.Columns[column.Name]; - } - retColumn.Resize(rowIndex + 1); - retColumn[rowIndex] = count; - }); - EnumerateColumnsWithRows(groupByColumnDelegate, columnDelegate, columnNames); - ret.SetTableRowCount(firstColumn.Length); + seenValues.Add(input); - return ret; + return true; + }, + columnNames + ); } public override DataFrame First(params string[] columnNames) From d9364770417dd0bd4a5615c1169351d6aff6885f Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Fri, 30 Jan 2026 16:28:26 -0600 Subject: [PATCH 3/7] Add test for CountIf --- test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 47ff904dce..85e851137f 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -467,6 +467,13 @@ public void TestGroupBy() Assert.Equal(4, column[row]); } } + + DataFrame countIf = dfWithDuplicates.GroupBy("Group").CountIf(input => input.RowValue is int and < 3, "Int"); + Assert.Equal(2, countIf.Columns.Count); + Assert.Equal(2, countIf.Rows.Count); + Assert.Equal(2L, countIf["Int"][0]); + Assert.Equal(3L, countIf["Int"][1]); + } [Fact] From aaa864653facc50b97cf7b88df98aa6e5abdfd3f Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 2 Feb 2026 16:26:32 -0600 Subject: [PATCH 4/7] Add test for GroupByPredicateInput getters and setters --- .../DataFrameTests.cs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 85e851137f..75f0ee81ec 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -473,7 +473,21 @@ public void TestGroupBy() Assert.Equal(2, countIf.Rows.Count); Assert.Equal(2L, countIf["Int"][0]); Assert.Equal(3L, countIf["Int"][1]); + } + [Fact] + public void TestGroupByPredicateInput() + { + var input = new GroupByPredicateInput + { + ColumnName = "TestColumn", + GroupKey = "TestKey", + RowValue = 123 + }; + + Assert.Equal("TestColumn", input.ColumnName); + Assert.Equal("TestKey", input.GroupKey); + Assert.Equal(123, input.RowValue); } [Fact] From e89130b70392d50b2d7fea7a1d7f7ba4472ad013 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Tue, 3 Feb 2026 12:23:44 -0600 Subject: [PATCH 5/7] Revert "Add test for GroupByPredicateInput getters and setters" This reverts commit aaa864653facc50b97cf7b88df98aa6e5abdfd3f. --- .../DataFrameTests.cs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 75f0ee81ec..85e851137f 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -473,21 +473,7 @@ public void TestGroupBy() Assert.Equal(2, countIf.Rows.Count); Assert.Equal(2L, countIf["Int"][0]); Assert.Equal(3L, countIf["Int"][1]); - } - [Fact] - public void TestGroupByPredicateInput() - { - var input = new GroupByPredicateInput - { - ColumnName = "TestColumn", - GroupKey = "TestKey", - RowValue = 123 - }; - - Assert.Equal("TestColumn", input.ColumnName); - Assert.Equal("TestKey", input.GroupKey); - Assert.Equal(123, input.RowValue); } [Fact] From c3232ddcc55a691fdde6a0e5ca0a207b88d7b7e0 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Tue, 3 Feb 2026 12:26:09 -0600 Subject: [PATCH 6/7] Add documentation for GroupByPredicateInput record. --- src/Microsoft.Data.Analysis/GroupBy.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Microsoft.Data.Analysis/GroupBy.cs b/src/Microsoft.Data.Analysis/GroupBy.cs index e9adc11650..93aec05b49 100644 --- a/src/Microsoft.Data.Analysis/GroupBy.cs +++ b/src/Microsoft.Data.Analysis/GroupBy.cs @@ -9,6 +9,9 @@ namespace Microsoft.Data.Analysis { + /// + /// A record to identify the row that is being aggregated that can be used to decide whether or not to include it in the aggregation. + /// public record GroupByPredicateInput { /// From 2e6f45e51e52876f97e9ec42a7775cc35a3140b0 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Tue, 3 Feb 2026 12:30:32 -0600 Subject: [PATCH 7/7] Specify input type explicitly --- test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 85e851137f..6320d15aba 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -468,12 +468,11 @@ public void TestGroupBy() } } - DataFrame countIf = dfWithDuplicates.GroupBy("Group").CountIf(input => input.RowValue is int and < 3, "Int"); + DataFrame countIf = dfWithDuplicates.GroupBy("Group").CountIf((GroupByPredicateInput input) => input.RowValue is int and < 3, "Int"); Assert.Equal(2, countIf.Columns.Count); Assert.Equal(2, countIf.Rows.Count); Assert.Equal(2L, countIf["Int"][0]); Assert.Equal(3L, countIf["Int"][1]); - } [Fact]