diff --git a/CompStats/__init__.py b/CompStats/__init__.py index 8c23749..304ae65 100644 --- a/CompStats/__init__.py +++ b/CompStats/__init__.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '0.1.10' +__version__ = '0.1.11' from CompStats.bootstrap import StatisticSamples from CompStats.measurements import CI, SE, difference_p_value from CompStats.performance import performance, difference, all_differences, plot_performance, plot_difference diff --git a/CompStats/interface.py b/CompStats/interface.py index f1143a3..a4262fb 100644 --- a/CompStats/interface.py +++ b/CompStats/interface.py @@ -20,7 +20,6 @@ from CompStats.utils import progress_bar from CompStats import measurements from CompStats.measurements import SE -from CompStats.performance import plot_performance, plot_difference from CompStats.utils import dataframe @@ -248,7 +247,7 @@ def best(self): else: self._best = np.array([key] * value.shape[1]) return self._best - BiB = True if self.statistic_samples.BiB else False + BiB = bool(self.statistic_samples.BiB) keys = np.array(list(self.statistic.keys())) data = np.asanyarray([self.statistic[k] for k in keys]) @@ -338,6 +337,12 @@ def plot(self, value_name:str=None, CI:float=0.05, kind:str='point', linestyle:str='none', col_wrap:int=3, capsize:float=0.2, + comparison:bool=True, + right:bool=True, + comp_legend:str='Comparison', + winner_legend:str='Best', + tie_legend:str='Equivalent', + loser_legend:str='Different', **kwargs): """plot with seaborn @@ -363,32 +368,79 @@ def plot(self, value_name:str=None, value_name = 'Score' else: value_name = 'Error' + if not isinstance(self.statistic, dict): + comparison = False + best = self.best + if isinstance(best, np.ndarray): + if best.shape[0] < col_wrap: + col_wrap = best.shape[0] df = self.dataframe(value_name=value_name, var_name=var_name, - 
alg_legend=alg_legend, perf_names=perf_names) + alg_legend=alg_legend, perf_names=perf_names, + comparison=comparison, alpha=CI, right=right, + comp_legend=comp_legend, + winner_legend=winner_legend, + tie_legend=tie_legend, + loser_legend=loser_legend) if var_name not in df.columns: var_name = None col_wrap = None ci = lambda x: measurements.CI(x, alpha=CI) + if comparison: + kwargs.update(dict(hue=comp_legend)) f_grid = sns.catplot(df, x=value_name, errorbar=ci, y=alg_legend, col=var_name, kind=kind, linestyle=linestyle, col_wrap=col_wrap, capsize=capsize, **kwargs) return f_grid - - def dataframe(self, value_name:str='Score', + def dataframe(self, comparison:bool=False, + right:bool=True, + alpha:float=0.05, + value_name:str='Score', var_name:str='Performance', alg_legend:str='Algorithm', + comp_legend:str='Comparison', + winner_legend:str='Best', + tie_legend:str='Equivalent', + loser_legend:str='Different', perf_names:str=None): """Dataframe""" if perf_names is None and isinstance(self.best, np.ndarray): func_name = self.statistic_func.__name__ perf_names = [f'{func_name}({i})' for i, k in enumerate(self.best)] - return dataframe(self, value_name=value_name, - var_name=var_name, - alg_legend=alg_legend, - perf_names=perf_names) + df = dataframe(self, value_name=value_name, + var_name=var_name, + alg_legend=alg_legend, + perf_names=perf_names) + if not comparison: + return df + df[comp_legend] = tie_legend + diff = self.difference() + best = self.best + if isinstance(best, str): + for name, p in diff.p_value(right=right).items(): + if p >= alpha: + continue + df.loc[df[alg_legend] == name, comp_legend] = loser_legend + df.loc[df[alg_legend] == best, comp_legend] = winner_legend + else: + p_values = diff.p_value(right=right) + systems = list(p_values.keys()) + p_values = np.array([p_values[k] for k in systems]) + for name, p_value, winner in zip(perf_names, + p_values.T, + best): + mask = df[var_name] == name + for alg, p in zip(systems, p_value): + if p >= 
alpha and winner != alg: + continue + _ = mask & (df[alg_legend] == alg) + if winner == alg: + df.loc[_, comp_legend] = winner_legend + else: + df.loc[_, comp_legend] = loser_legend + return df @property def n_jobs(self): diff --git a/CompStats/tests/test_interface.py b/CompStats/tests/test_interface.py index feebee9..f2d02c4 100644 --- a/CompStats/tests/test_interface.py +++ b/CompStats/tests/test_interface.py @@ -17,12 +17,30 @@ from sklearn.svm import LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import GaussianNB -from sklearn.datasets import load_iris, load_digits +from sklearn.datasets import load_iris, load_digits, load_breast_cancer from sklearn.model_selection import train_test_split import pandas as pd from CompStats.tests.test_performance import DATA +def test_Perf_plot_col_wrap(): + """Test plot when 2 classes""" + from CompStats.metrics import f1_score + + X, y = load_breast_cancer(return_X_y=True) + _ = train_test_split(X, y, test_size=0.3) + X_train, X_val, y_train, y_val = _ + ens = RandomForestClassifier().fit(X_train, y_train) + nb = GaussianNB().fit(X_train, y_train) + svm = LinearSVC().fit(X_train, y_train) + score = f1_score(y_val, ens.predict(X_val), + average=None, + num_samples=50) + score(nb.predict(X_val)) + score(svm.predict(X_val)) + score.plot() + + def test_Difference_dataframe(): """Test Difference dataframe""" from CompStats.metrics import f1_score diff --git a/README.rst b/README.rst index 7e124c8..4ac5f92 100644 --- a/README.rst +++ b/README.rst @@ -27,7 +27,7 @@ CompStats Collaborative competitions have gained popularity in the scientific and technological fields. These competitions involve defining tasks, selecting evaluation scores, and devising result verification methods. In the standard scenario, participants receive a training set and are expected to provide a solution for a held-out dataset kept by organizers. 
An essential challenge for organizers arises when comparing algorithms' performance, assessing multiple participants, and ranking them. Statistical tools are often used for this purpose; however, traditional statistical methods often fail to capture decisive differences between systems' performance. CompStats implements an evaluation methodology for statistically analyzing competition results and competition. CompStats offers several advantages, including off-the-shell comparisons with correction mechanisms and the inclusion of confidence intervals. -To illustrate the use of `CompStats`, the following snippets show an example. The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), three different classifiers, and the last line is the score used to measure the performance and compare the algorithm. +To illustrate the use of `CompStats`, the following snippets show an example. The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), four different classifiers, and the last line is the score used to measure the performance and compare the algorithm. >>> from sklearn.svm import LinearSVC >>> from sklearn.naive_bayes import GaussianNB @@ -51,10 +51,10 @@ Once the predictions are available, it is time to measure the algorithm's perfor >>> score -The previous code shows the macro-f1 score and, in parenthesis, its standard error. The actual performance value is stored in the `statistic` function. +The previous code shows the macro-f1 score and its standard error. The actual performance value and its standard error are stored in the attributes `statistic` and `se`, respectively. ->>> score.statistic -0.9434834454375508 +>>> score.statistic, score.se +(0.9521479775366307, 0.009717884979482313) Continuing with the example, let us assume that one wants to test another classifier on the same problem, in this case, a random forest, as can be seen in the following two lines.
The second line predicts the validation set and sets it to the analysis. @@ -63,28 +63,36 @@ Continuing with the example, let us assume that one wants to test another classi Statistic with its standard error (se) statistic (se) -0.9655 (0.0077) <= Random Forest -0.9435 (0.0099) <= alg-1 +0.9720 (0.0076) <= Random Forest +0.9521 (0.0097) <= alg-1 -Let us incorporate another prediction, now with the Naive Bayes classifier, as seen below. +Let us incorporate other predictions, now with the Naive Bayes classifier and Histogram Gradient Boosting, as seen below. >>> nb = GaussianNB().fit(X_train, y_train) >>> score(nb.predict(X_val), name='Naive Bayes') Statistic with its standard error (se) statistic (se) -0.9655 (0.0077) <= Random Forest -0.9435 (0.0099) <= alg-1 -0.8549 (0.0153) <= Naive Bayes +0.9759 (0.0068) <= Hist. Grad. Boost. Tree +0.9720 (0.0076) <= Random Forest +0.9521 (0.0097) <= alg-1 +0.8266 (0.0159) <= Naive Bayes -The final step is to compare the performance of the three classifiers, which can be done with the `difference` method, as seen next. +The performance, its confidence interval (5%), and a statistical comparison (5%) between the best-performing system and the rest of the algorithms are depicted in the following figure. + +>>> score.plot() + +.. image:: https://github.com/INGEOTEC/CompStats/raw/docs/docs/source/digits_perf.png + +The final step is to compare the performance of the four classifiers, which can be done with the `difference` method, as seen next. >>> diff = score.difference() >>> diff -difference p-values w.r.t Random Forest +difference p-values w.r.t Hist. Grad. Boost. Tree 0.0000 <= Naive Bayes -0.0120 <= alg-1 +0.0100 <= alg-1 +0.3240 <= Random Forest The class `Difference` has the `plot` method that can be used to depict the difference with respect to the best.
diff --git a/docs/CompStats_metrics.ipynb b/docs/CompStats_metrics.ipynb index 2527d84..28c8838 100644 --- a/docs/CompStats_metrics.ipynb +++ b/docs/CompStats_metrics.ipynb @@ -48,58 +48,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { - "id": "I1B4Ktin2VfE", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "b37f28d5-90d9-4d94-c28c-27f9054f1a23" + "id": "I1B4Ktin2VfE" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting git+https://github.com/INGEOTEC/CompStats@develop\n", - " Cloning https://github.com/INGEOTEC/CompStats (to revision develop) to /tmp/pip-req-build-yb73d9s4\n", - " Running command git clone --filter=blob:none --quiet https://github.com/INGEOTEC/CompStats /tmp/pip-req-build-yb73d9s4\n", - " Running command git checkout -b develop --track origin/develop\n", - " Switched to a new branch 'develop'\n", - " Branch 'develop' set up to track remote branch 'develop' from 'origin'.\n", - " Resolved https://github.com/INGEOTEC/CompStats to commit 438a8055b71bba437bad7bd1ef5427b29e0ed245\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from CompStats==0.1.7) (1.26.4)\n", - "Requirement already satisfied: scikit-learn>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from CompStats==0.1.7) (1.6.1)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from CompStats==0.1.7) (2.2.2)\n", - "Requirement already satisfied: seaborn>=0.13.0 in /usr/local/lib/python3.11/dist-packages (from CompStats==0.1.7) (0.13.2)\n", - "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=1.3.0->CompStats==0.1.7) (1.13.1)\n", - "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=1.3.0->CompStats==0.1.7) (1.4.2)\n", - "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=1.3.0->CompStats==0.1.7) (3.5.0)\n", - "Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /usr/local/lib/python3.11/dist-packages (from seaborn>=0.13.0->CompStats==0.1.7) (3.10.0)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->CompStats==0.1.7) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->CompStats==0.1.7) (2025.1)\n", - "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->CompStats==0.1.7) (2025.1)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn>=0.13.0->CompStats==0.1.7) (1.3.1)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn>=0.13.0->CompStats==0.1.7) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from 
matplotlib!=3.6.1,>=3.4->seaborn>=0.13.0->CompStats==0.1.7) (4.56.0)\n", - "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn>=0.13.0->CompStats==0.1.7) (1.4.8)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn>=0.13.0->CompStats==0.1.7) (24.2)\n", - "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn>=0.13.0->CompStats==0.1.7) (11.1.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn>=0.13.0->CompStats==0.1.7) (3.2.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->CompStats==0.1.7) (1.17.0)\n", - "Building wheels for collected packages: CompStats\n", - " Building wheel for CompStats (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for CompStats: filename=CompStats-0.1.7-py3-none-any.whl size=41028 sha256=e70584ada7f0c49c8768febba12e978c4e122d93bf8452e2d42ea190cc5b1ece\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-sgv6titu/wheels/4f/d2/a1/8d1d30289bd99417ea947fc1e1f4587404d4e3a043b41f0289\n", - "Successfully built CompStats\n", - "Installing collected packages: CompStats\n", - "Successfully installed CompStats-0.1.7\n" - ] - } - ], + "outputs": [], "source": [ "try:\n", " import CompStats\n", @@ -122,7 +75,7 @@ "source": [ "`CompStats.metrics` aims to facilitate performance measurement (with standard errors and confidence intervals) and statistical comparisons between algorithms on a single problem, wrapping the different scores and loss functions found on `metrics`.\n", "\n", - "To illustrate the use of `CompStats.metrics`, the following snippets show an example. 
The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), three different classifiers, and the last line is the score used to measure the performance and compare the algorithm." + "To illustrate the use of `CompStats.metrics`, the following snippets show an example. The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), four different classifiers, and the last line is the score used to measure the performance and compare the algorithm." ], "metadata": { "id": "ZyRCAFoJ3LzP" @@ -133,7 +86,7 @@ "source": [ "from sklearn.svm import LinearSVC\n", "from sklearn.naive_bayes import GaussianNB\n", - "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier\n", "from sklearn.datasets import load_digits\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.base import clone\n", @@ -142,7 +95,7 @@ "metadata": { "id": "jEpd52Kq214r" }, - "execution_count": 2, + "execution_count": 12, "outputs": [] }, { @@ -166,7 +119,7 @@ "metadata": { "id": "JGJczaOW3WeK" }, - "execution_count": 3, + "execution_count": 13, "outputs": [] }, { @@ -189,33 +142,33 @@ "base_uri": "https://localhost:8080/" }, "id": "Al0u9ZPB3cSj", - "outputId": "33d3dc5e-d7c7-4a2e-a3af-284a99935733" + "outputId": "6eb7e800-b835-46af-84d8-16c3ad214d58" }, - "execution_count": 4, + "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ - "100%|██████████| 1/1 [00:05<00:00, 5.46s/it]\n" + "100%|██████████| 1/1 [00:00<00:00, 1.03it/s]\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ - "" + "" ] }, "metadata": {}, - "execution_count": 4 + "execution_count": 14 } ] }, { "cell_type": "markdown", "source": [ - "The previous code shows the macro-f1 score and, in parenthesis, its standard error. The actual performance value is stored in the `Perf.statistic` function." 
+ "The previous code shows the macro-f1 score and, in parenthesis, its standard error. The actual performance value is stored in the attributes `statistic` function, and `se`." ], "metadata": { "id": "OV5-SrTh3loq" @@ -224,26 +177,26 @@ { "cell_type": "code", "source": [ - "score.statistic" + "score.statistic, score.se" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ye1HH4pn3jde", - "outputId": "a0f19b8f-eeb4-46ba-d0a1-8f3d59b67527" + "outputId": "3c7c9e30-06fa-4e62-b0ea-6eb53012af01" }, - "execution_count": 5, + "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "0.9434834454375508" + "(0.9521479775366307, 0.009717884979482313)" ] }, "metadata": {}, - "execution_count": 5 + "execution_count": 15 } ] }, @@ -267,15 +220,15 @@ "base_uri": "https://localhost:8080/" }, "id": "vboh7N9B3pDr", - "outputId": "980e3a62-1577-425d-ed59-69fbb93c4945" + "outputId": "73b9de04-f3b7-4a17-d2a7-09dc453c8f4e" }, - "execution_count": 6, + "execution_count": 16, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ - "100%|██████████| 1/1 [00:01<00:00, 1.04s/it]\n" + "100%|██████████| 1/1 [00:00<00:00, 1.03it/s]\n" ] }, { @@ -285,19 +238,19 @@ "\n", "Statistic with its standard error (se)\n", "statistic (se)\n", - "0.9655 (0.0077) <= Random Forest\n", - "0.9435 (0.0099) <= alg-1" + "0.9720 (0.0076) <= Random Forest\n", + "0.9521 (0.0097) <= alg-1" ] }, "metadata": {}, - "execution_count": 6 + "execution_count": 16 } ] }, { "cell_type": "markdown", "source": [ - "Let us incorporate another prediction, now with the Naive Bayes classifier, as seen below." + "Let us incorporate another predictions, now with Naive Bayes classifier, and Histogram Gradient Boosting as seen below." 
], "metadata": { "id": "v2R8F5H73vuc" @@ -307,22 +260,24 @@ "cell_type": "code", "source": [ "nb = GaussianNB().fit(X_train, y_train)\n", - "score(nb.predict(X_val), name='Naive Bayes')" + "score(nb.predict(X_val), name='Naive Bayes')\n", + "hist = HistGradientBoostingClassifier().fit(X_train, y_train)\n", + "score(hist.predict(X_val), name='Hist. Grad. Boost. Tree')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pVOaQb0T3tyN", - "outputId": "258373fc-4afb-4442-9e5f-e9fb75d743ef" + "outputId": "a38ce6fe-cadd-4f85-d127-423d66c2947b" }, - "execution_count": 7, + "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ - "100%|██████████| 1/1 [00:01<00:00, 1.48s/it]\n" + "100%|██████████| 2/2 [00:02<00:00, 1.00s/it]\n" ] }, { @@ -332,20 +287,67 @@ "\n", "Statistic with its standard error (se)\n", "statistic (se)\n", - "0.9655 (0.0077) <= Random Forest\n", - "0.9435 (0.0099) <= alg-1\n", - "0.8549 (0.0153) <= Naive Bayes" + "0.9759 (0.0068) <= Hist. Grad. Boost. Tree\n", + "0.9720 (0.0076) <= Random Forest\n", + "0.9521 (0.0097) <= alg-1\n", + "0.8266 (0.0159) <= Naive Bayes" ] }, "metadata": {}, - "execution_count": 7 + "execution_count": 17 } ] }, { "cell_type": "markdown", "source": [ - "The final step is to compare the performance of the three classifiers, which can be done with the `Perf.difference` method, as seen next. " + "The performance, its confidence interval (5\\%), and a statistical comparison (5\\%) between the best performing system with the rest of the algorithms is depicted in the following figure." 
+ ], + "metadata": { + "id": "EY2NX9twUWjQ" + } + }, + { + "cell_type": "code", + "source": [ + "score.plot()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 523 + }, + "id": "5hxPgZY-UXCc", + "outputId": "068f61bb-e52a-48b3-a0bb-d37eab65b299" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 18 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The final step is to compare the performance of the four classifiers, which can be done with the `Perf.difference` method, as seen next. " ], "metadata": { "id": "VRqHiUXN32ZX" @@ -362,22 +364,23 @@ "base_uri": "https://localhost:8080/" }, "id": "XWAqUpYE3za2", - "outputId": "3f108864-6a1f-41bf-dda3-ccab294444e5" + "outputId": "2804fcd4-d766-4de5-ee27-276d4d7f3324" }, - "execution_count": 8, + "execution_count": 22, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "\n", - "difference p-values w.r.t Random Forest\n", + "difference p-values w.r.t Hist. Grad. Boost. Tree\n", "0.0000 <= Naive Bayes\n", - "0.0120 <= alg-1" + "0.0100 <= alg-1\n", + "0.3240 <= Random Forest" ] }, "metadata": {}, - "execution_count": 8 + "execution_count": 22 } ] }, @@ -398,34 +401,33 @@ "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 546 + "height": 529 }, "id": "Fai01O3q3-SN", - "outputId": "eae94c83-bac5-473d-b708-23f549b27b07" + "outputId": "a5aaacce-74fc-42dc-86dc-09512421ffbe" }, - "execution_count": 9, + "execution_count": 23, "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ] - }, - "metadata": {}, - "execution_count": 9 - }, { "output_type": "display_data", "data": { "text/plain": [ - "
" + "
" ], - "image/png": "\n" + "image/png": "\n" }, "metadata": {} } ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "VI3JFKudVYNI" + }, + "execution_count": null, + "outputs": [] } ] } \ No newline at end of file diff --git a/docs/source/digits_difference.png b/docs/source/digits_difference.png index 608e0b6..74cee9d 100644 Binary files a/docs/source/digits_difference.png and b/docs/source/digits_difference.png differ diff --git a/docs/source/digits_perf.png b/docs/source/digits_perf.png new file mode 100644 index 0000000..08ce001 Binary files /dev/null and b/docs/source/digits_perf.png differ diff --git a/docs/source/metrics_api.rst b/docs/source/metrics_api.rst index f6f376b..4a600a6 100644 --- a/docs/source/metrics_api.rst +++ b/docs/source/metrics_api.rst @@ -27,7 +27,7 @@ :py:mod:`CompStats.metrics` aims to facilitate performance measurement (with standard errors and confidence intervals) and statistical comparisons between algorithms on a single problem, wrapping the different scores and loss functions found on :py:mod:`~sklearn.metrics`. -To illustrate the use of :py:mod:`CompStats.metrics`, the following snippets show an example. The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), three different classifiers, and the last line is the score used to measure the performance and compare the algorithm. +To illustrate the use of :py:mod:`CompStats.metrics`, the following snippets show an example. The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), four different classifiers, and the last line is the score used to measure the performance and compare the algorithm. 
>>> from sklearn.svm import LinearSVC >>> from sklearn.naive_bayes import GaussianNB @@ -49,45 +49,52 @@ Once the predictions are available, it is time to measure the algorithm's perfor >>> score = f1_score(y_val, hy, average='macro') >>> score - -Statistic with its standard error (se) -statistic (se) -0.9332 (0.0113) <= alg-1 + -The previous code shows the macro-f1 score and, in parenthesis, its standard error. The actual performance value is stored in the :py:func:`~CompStats.interface.Perf.statistic` function. +The previous code shows the macro-f1 score and, in parentheses, its standard error. The actual performance value and its standard error are stored in the attributes :py:func:`~CompStats.interface.Perf.statistic` and :py:func:`~CompStats.interface.Perf.se`, respectively. ->>> score.statistic -{'alg-1': 0.9332035615949114} +>>> score.statistic, score.se +(0.9521479775366307, 0.009717884979482313) Continuing with the example, let us assume that one wants to test another classifier on the same problem, in this case, a random forest, as can be seen in the following two lines. The second line predicts the validation set and sets it to the analysis. >>> ens = RandomForestClassifier().fit(X_train, y_train) >>> score(ens.predict(X_val), name='Random Forest') - + Statistic with its standard error (se) statistic (se) -0.9756 (0.0061) <= Random Forest -0.9332 (0.0113) <= alg-1 +0.9720 (0.0076) <= Random Forest +0.9521 (0.0097) <= alg-1 -Let us incorporate another prediction, now with the Naive Bayes classifier, as seen below. +Let us incorporate other predictions, now with the Naive Bayes classifier and Histogram Gradient Boosting, as seen below. >>> nb = GaussianNB().fit(X_train, y_train) >>> score(nb.predict(X_val), name='Naive Bayes') - +>>> hist = HistGradientBoostingClassifier().fit(X_train, y_train) +>>> score(hist.predict(X_val), name='Hist. Grad. Boost.
Tree') + Statistic with its standard error (se) statistic (se) -0.9756 (0.0061) <= Random Forest -0.9332 (0.0113) <= alg-1 -0.8198 (0.0144) <= Naive Bayes +0.9759 (0.0068) <= Hist. Grad. Boost. Tree +0.9720 (0.0076) <= Random Forest +0.9521 (0.0097) <= alg-1 +0.8266 (0.0159) <= Naive Bayes + +The performance, its confidence interval (5%), and a statistical comparison (5%) between the best-performing system and the rest of the algorithms are depicted in the following figure. + +>>> score.plot() + +.. image:: digits_perf.png -The final step is to compare the performance of the three classifiers, which can be done with the :py:func:`~CompStats.interface.Perf.difference` method, as seen next. +The final step is to compare the performance of the four classifiers, which can be done with the :py:func:`~CompStats.interface.Perf.difference` method, as seen next. >>> diff = score.difference() >>> diff -difference p-values w.r.t Random Forest -0.0000 <= alg-1 +difference p-values w.r.t Hist. Grad. Boost. Tree 0.0000 <= Naive Bayes +0.0100 <= alg-1 +0.3240 <= Random Forest The class :py:class:`~CompStats.Difference` has the :py:class:`~CompStats.Difference.plot` method that can be used to depict the difference with respectto the best.