2 changes: 1 addition & 1 deletion CompStats/__init__.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = '0.1.10'
__version__ = '0.1.11'
from CompStats.bootstrap import StatisticSamples
from CompStats.measurements import CI, SE, difference_p_value
from CompStats.performance import performance, difference, all_differences, plot_performance, plot_difference
70 changes: 61 additions & 9 deletions CompStats/interface.py
@@ -20,7 +20,6 @@
from CompStats.utils import progress_bar
from CompStats import measurements
from CompStats.measurements import SE
from CompStats.performance import plot_performance, plot_difference
from CompStats.utils import dataframe


@@ -248,7 +247,7 @@ def best(self):
else:
self._best = np.array([key] * value.shape[1])
return self._best
BiB = True if self.statistic_samples.BiB else False
BiB = bool(self.statistic_samples.BiB)
keys = np.array(list(self.statistic.keys()))
data = np.asanyarray([self.statistic[k]
for k in keys])
@@ -338,6 +337,12 @@ def plot(self, value_name:str=None,
CI:float=0.05,
kind:str='point', linestyle:str='none',
col_wrap:int=3, capsize:float=0.2,
comparison:bool=True,
right:bool=True,
comp_legend:str='Comparison',
winner_legend:str='Best',
tie_legend:str='Equivalent',
loser_legend:str='Different',
**kwargs):
"""plot with seaborn

@@ -363,32 +368,79 @@
value_name = 'Score'
else:
value_name = 'Error'
if not isinstance(self.statistic, dict):
comparison = False
best = self.best
if isinstance(best, np.ndarray):
if best.shape[0] < col_wrap:
col_wrap = best.shape[0]
df = self.dataframe(value_name=value_name, var_name=var_name,
alg_legend=alg_legend, perf_names=perf_names)
alg_legend=alg_legend, perf_names=perf_names,
comparison=comparison, alpha=CI, right=right,
comp_legend=comp_legend,
winner_legend=winner_legend,
tie_legend=tie_legend,
loser_legend=loser_legend)
if var_name not in df.columns:
var_name = None
col_wrap = None
ci = lambda x: measurements.CI(x, alpha=CI)
if comparison:
kwargs.update(dict(hue=comp_legend))
f_grid = sns.catplot(df, x=value_name, errorbar=ci,
y=alg_legend, col=var_name,
kind=kind, linestyle=linestyle,
col_wrap=col_wrap, capsize=capsize, **kwargs)
return f_grid


def dataframe(self, value_name:str='Score',
def dataframe(self, comparison:bool=False,
right:bool=True,
alpha:float=0.05,
value_name:str='Score',
var_name:str='Performance',
alg_legend:str='Algorithm',
comp_legend:str='Comparison',
winner_legend:str='Best',
tie_legend:str='Equivalent',
loser_legend:str='Different',
perf_names:str=None):
"""Dataframe"""
if perf_names is None and isinstance(self.best, np.ndarray):
func_name = self.statistic_func.__name__
perf_names = [f'{func_name}({i})'
for i, k in enumerate(self.best)]
return dataframe(self, value_name=value_name,
var_name=var_name,
alg_legend=alg_legend,
perf_names=perf_names)
df = dataframe(self, value_name=value_name,
var_name=var_name,
alg_legend=alg_legend,
perf_names=perf_names)
if not comparison:
return df
df[comp_legend] = tie_legend
diff = self.difference()
best = self.best
if isinstance(best, str):
for name, p in diff.p_value(right=right).items():
if p >= alpha:
continue
df.loc[df[alg_legend] == name, comp_legend] = loser_legend
df.loc[df[alg_legend] == best, comp_legend] = winner_legend
else:
p_values = diff.p_value(right=right)
systems = list(p_values.keys())
p_values = np.array([p_values[k] for k in systems])
for name, p_value, winner in zip(perf_names,
p_values.T,
best):
mask = df[var_name] == name
for alg, p in zip(systems, p_value):
if p >= alpha and winner != alg:
continue
_ = mask & (df[alg_legend] == alg)
if winner == alg:
df.loc[_, comp_legend] = winner_legend
else:
df.loc[_, comp_legend] = loser_legend
return df

@property
def n_jobs(self):
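The new `comparison`, `right`, and legend keyword arguments added above let `Perf.plot` colour each algorithm by the outcome of the statistical comparison against the best system, and let `Perf.dataframe` expose the same labels in a `Comparison` column. A minimal sketch of how this might be exercised follows; the iris split, the two classifiers, and the explicit `average='macro'` / `num_samples=50` choices are illustrative assumptions, not part of this PR.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from CompStats.metrics import f1_score

# Illustrative setup (not taken from the PR): two classifiers on iris.
X, y = load_iris(return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)
forest = RandomForestClassifier().fit(X_train, y_train)
nb = GaussianNB().fit(X_train, y_train)

# Macro-F1 with bootstrap samples; further predictions are added by calling
# the returned Perf object.
score = f1_score(y_val, forest.predict(X_val), average='macro', num_samples=50)
score(nb.predict(X_val), name='Naive Bayes')

# New in this PR: the long-format dataframe can carry a 'Comparison' column
# that labels each algorithm as 'Best', 'Equivalent', or 'Different'.
df = score.dataframe(comparison=True, alpha=0.05)
print(df.drop_duplicates('Algorithm')[['Algorithm', 'Comparison']])

# plot() now uses that column as the hue, so the best system and the ones
# that differ statistically from it are visually distinguished.
score.plot(comparison=True, CI=0.05)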
20 changes: 19 additions & 1 deletion CompStats/tests/test_interface.py
@@ -17,12 +17,30 @@
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris, load_digits
from sklearn.datasets import load_iris, load_digits, load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd
from CompStats.tests.test_performance import DATA


def test_Perf_plot_col_wrap():
"""Test plot when 2 classes"""
from CompStats.metrics import f1_score

X, y = load_breast_cancer(return_X_y=True)
_ = train_test_split(X, y, test_size=0.3)
X_train, X_val, y_train, y_val = _
ens = RandomForestClassifier().fit(X_train, y_train)
nb = GaussianNB().fit(X_train, y_train)
svm = LinearSVC().fit(X_train, y_train)
score = f1_score(y_val, ens.predict(X_val),
average=None,
num_samples=50)
score(nb.predict(X_val))
score(svm.predict(X_val))
score.plot()


def test_Difference_dataframe():
"""Test Difference dataframe"""
from CompStats.metrics import f1_score
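The new test above covers the per-class case: with `average=None` on a binary problem, `best` becomes an array with one entry per class, the dataframe gains a `Performance` column named from the score function, and `plot` shrinks `col_wrap` to the number of panels. A short sketch of that scenario, reusing the same dataset and classifiers as the test; the dataframe inspection at the end is an illustrative addition.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from CompStats.metrics import f1_score

X_train, X_val, y_train, y_val = train_test_split(
    *load_breast_cancer(return_X_y=True), test_size=0.3)
ens = RandomForestClassifier().fit(X_train, y_train)
nb = GaussianNB().fit(X_train, y_train)

# average=None gives one F1 value per class, so the comparison is run
# per performance panel rather than once overall.
score = f1_score(y_val, ens.predict(X_val), average=None, num_samples=50)
score(nb.predict(X_val), name='Naive Bayes')

df = score.dataframe(comparison=True)
print(df['Performance'].unique())   # e.g. ['f1_score(0)', 'f1_score(1)']

# Only two panels here, so plot() reduces col_wrap from its default of 3 to 2.
score.plot()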
34 changes: 21 additions & 13 deletions README.rst
@@ -27,7 +27,7 @@ CompStats

Collaborative competitions have gained popularity in the scientific and technological fields. These competitions involve defining tasks, selecting evaluation scores, and devising result verification methods. In the standard scenario, participants receive a training set and are expected to provide a solution for a held-out dataset kept by the organizers. An essential challenge for organizers arises when comparing algorithms' performance, assessing multiple participants, and ranking them. Statistical tools are often used for this purpose; however, traditional statistical methods often fail to capture decisive differences between systems' performance. CompStats implements an evaluation methodology for statistically analyzing competition results. CompStats offers several advantages, including off-the-shelf comparisons with correction mechanisms and the inclusion of confidence intervals.

To illustrate the use of `CompStats`, the following snippets show an example. The instructions load the necessary libraries, including the one used to obtain the problem (e.g., digits), three different classifiers, and, in the last line, the score used to measure the performance and compare the algorithms.
To illustrate the use of `CompStats`, the following snippets show an example. The instructions load the necessary libraries, including the one used to obtain the problem (e.g., digits), four different classifiers, and, in the last line, the score used to measure the performance and compare the algorithms.

>>> from sklearn.svm import LinearSVC
>>> from sklearn.naive_bayes import GaussianNB
@@ -51,10 +51,10 @@ Once the predictions are available, it is time to measure the algorithm's perfor
>>> score
<Perf(score_func=f1_score, statistic=0.9435, se=0.0099)>

The previous code shows the macro-f1 score and, in parenthesis, its standard error. The actual performance value is stored in the `statistic` function.
The previous code shows the macro-f1 score and its standard error. The actual values are stored in the attributes `statistic` and `se`.

>>> score.statistic
0.9434834454375508
>>> score.statistic, score.se
(0.9521479775366307, 0.009717884979482313)

Continuing with the example, let us assume that one wants to test another classifier on the same problem, in this case, a random forest, as can be seen in the following two lines. The second line predicts the validation set and sets it to the analysis.

@@ -63,28 +63,36 @@ Continuing with the example, let us assume that one wants to test another classi
<Perf(score_func=f1_score)>
Statistic with its standard error (se)
statistic (se)
0.9655 (0.0077) <= Random Forest
0.9435 (0.0099) <= alg-1
0.9720 (0.0076) <= Random Forest
0.9521 (0.0097) <= alg-1

Let us incorporate another prediction, now with the Naive Bayes classifier, as seen below.
Let us incorporate two more predictions, now with the Naive Bayes classifier and Histogram Gradient Boosting, as seen below.

>>> nb = GaussianNB().fit(X_train, y_train)
>>> score(nb.predict(X_val), name='Naive Bayes')
<Perf(score_func=f1_score)>
Statistic with its standard error (se)
statistic (se)
0.9655 (0.0077) <= Random Forest
0.9435 (0.0099) <= alg-1
0.8549 (0.0153) <= Naive Bayes
0.9759 (0.0068) <= Hist. Grad. Boost. Tree
0.9720 (0.0076) <= Random Forest
0.9521 (0.0097) <= alg-1
0.8266 (0.0159) <= Naive Bayes

The final step is to compare the performance of the three classifiers, which can be done with the `difference` method, as seen next.
The performance, its confidence interval (5%), and a statistical comparison (5%) between the best-performing system and the rest of the algorithms are depicted in the following figure.

>>> score.plot()

.. image:: https://github.com/INGEOTEC/CompStats/raw/docs/docs/source/digits_perf.png

The final step is to compare the performance of the four classifiers, which can be done with the `difference` method, as seen next.

>>> diff = score.difference()
>>> diff
<Difference>
difference p-values w.r.t Random Forest
difference p-values w.r.t Hist. Grad. Boost. Tree
0.0000 <= Naive Bayes
0.0120 <= alg-1
0.0100 <= alg-1
0.3240 <= Random Forest

The class `Difference` has the `plot` method that can be used to depict the difference with respect to the best.
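Several of the snippet cells are collapsed in this view. For reference, here is a minimal end-to-end sketch of the workflow the README describes; the 70/30 split, the use of `LinearSVC` for the `alg-1` entry, scikit-learn's `HistGradientBoostingClassifier` standing in for `Hist. Grad. Boost. Tree`, and the explicit `average='macro'` are assumptions rather than content taken from this PR.

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from CompStats.metrics import f1_score

# Digits problem with a held-out validation split (assumed 70/30).
X, y = load_digits(return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)

# First system ('alg-1'); the macro-F1 Perf object is created here.
svm = LinearSVC().fit(X_train, y_train)
score = f1_score(y_val, svm.predict(X_val), average='macro')

# Remaining systems are added by calling the Perf object with their predictions.
forest = RandomForestClassifier().fit(X_train, y_train)
score(forest.predict(X_val), name='Random Forest')
nb = GaussianNB().fit(X_train, y_train)
score(nb.predict(X_val), name='Naive Bayes')
hist = HistGradientBoostingClassifier().fit(X_train, y_train)
score(hist.predict(X_val), name='Hist. Grad. Boost. Tree')

score.plot()               # performance, confidence intervals, and comparison hue
diff = score.difference()  # p-values w.r.t. the best-performing system
diff.plot()                # difference with respect to the best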
