From 9dde5b916ef70fc4854bad3ce95d2a4851c0c417 Mon Sep 17 00:00:00 2001
From: Naoise Holohan <51835109+naoise-h@users.noreply.github.com>
Date: Tue, 23 Jan 2024 15:51:57 +0000
Subject: [PATCH] [BUG] Fixes for scikit-learn v1.4 (#91)

* Bugfixes for scikit-learn 1.4

* Fixing pycodestyle error

* Updating GitHub action tests

* Skipping LogReg test for older sklearn versions

* Adding Python 3.12 classifier tag

* Tidying DecisionTreeClassifier fix
---
 .github/workflows/code.yml                |  2 +-
 .github/workflows/general.yml             |  2 +-
 .github/workflows/libraries.yml           | 27 ++++++++---------------
 diffprivlib/models/forest.py              | 10 ++++++++-
 diffprivlib/models/logistic_regression.py | 10 ++++++---
 setup.py                                  |  1 +
 tests/models/test_LogisticRegression.py   |  6 +++--
 7 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/code.yml b/.github/workflows/code.yml
index 0d69505..4044d1f 100644
--- a/.github/workflows/code.yml
+++ b/.github/workflows/code.yml
@@ -17,7 +17,7 @@ jobs:
     - uses: actions/checkout@v3
     - uses: actions/setup-python@v4
       with:
-        python-version: '3.10'
+        python-version: '3.11'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
diff --git a/.github/workflows/general.yml b/.github/workflows/general.yml
index a0eeb9c..6aa48da 100644
--- a/.github/workflows/general.yml
+++ b/.github/workflows/general.yml
@@ -19,7 +19,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11']
+        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
 
     steps:
     - uses: actions/checkout@v3
diff --git a/.github/workflows/libraries.yml b/.github/workflows/libraries.yml
index c3f5def..81b2ecb 100644
--- a/.github/workflows/libraries.yml
+++ b/.github/workflows/libraries.yml
@@ -19,44 +19,35 @@ jobs:
 
       matrix:
         include:
-          - library: numpy
-            version: 1.21.6
-            python-version: '3.10'
-          - library: numpy
-            version: 1.22.4
-            python-version: '3.10'
           - library: numpy
             version: 1.23.5
             python-version: '3.10'
           - library: numpy
             version: 1.24.4
             python-version: '3.11'
+          - library: numpy
+            version: 1.25.2
+            python-version: '3.11'
 
-          - library: scikit-learn
-            version: 0.24.2
-            python-version: 3.9
-          - library: scikit-learn
-            version: 1.0.2
-            python-version: '3.10'
           - library: scikit-learn
             version: 1.1.3
             python-version: '3.10'
           - library: scikit-learn
             version: 1.2.2
             python-version: '3.10'
+          - library: scikit-learn
+            version: 1.3.2
+            python-version: '3.11'
 
-          - library: scipy
-            version: 1.7.3
-            python-version: '3.10'
-          - library: scipy
-            version: 1.8.1
-            python-version: '3.10'
           - library: scipy
             version: 1.9.3
             python-version: '3.11'
           - library: scipy
             version: 1.10.1
             python-version: '3.11'
+          - library: scipy
+            version: 1.11.4
+            python-version: '3.11'
 
           - library: crlibm
             python-version: '3.10'
diff --git a/diffprivlib/models/forest.py b/diffprivlib/models/forest.py
index 042c61c..d3e373b 100644
--- a/diffprivlib/models/forest.py
+++ b/diffprivlib/models/forest.py
@@ -345,7 +345,7 @@ class DecisionTreeClassifier(skDecisionTreeClassifier, DiffprivlibMixin):
         skDecisionTreeClassifier, "max_depth", "random_state")
 
     def __init__(self, max_depth=5, *, epsilon=1, bounds=None, classes=None, random_state=None, accountant=None,
-                 **unused_args):
+                 criterion=None, **unused_args):
         # Todo: Remove when scikit-learn v1.0 is a min requirement
         try:
             super().__init__(  # pylint: disable=unexpected-keyword-arg
@@ -379,6 +379,9 @@ def __init__(self, max_depth=5, *, epsilon=1, bounds=None, classes=None, random_
         self.classes = classes
         self.accountant = BudgetAccountant.load_default(accountant)
 
+        if criterion is not None:
+            unused_args['criterion'] = criterion
+
         self._warn_unused_args(unused_args)
 
     def fit(self, X, y, sample_weight=None, check_input=True):
@@ -448,6 +451,11 @@ def fit(self, X, y, sample_weight=None, check_input=True):
 
         return self
 
+    def _fit(self, X, y, sample_weight=None, check_input=True, missing_values_in_feature_mask=None):
+        self.fit(X, y, sample_weight=sample_weight, check_input=check_input)
+        # missing_values_in_feature_mask is accepted but not forwarded to fit() (presumably sklearn>=1.4 compat).
+        return self
+
     @property
     def n_features_(self):
         return self.n_features_in_
diff --git a/diffprivlib/models/logistic_regression.py b/diffprivlib/models/logistic_regression.py
index 112ea9f..097cde4 100644
--- a/diffprivlib/models/logistic_regression.py
+++ b/diffprivlib/models/logistic_regression.py
@@ -371,7 +371,7 @@ def _logistic_regression_path(X, y, epsilon, data_norm, pos_class=None, Cs=10, f
         X = check_array(X, accept_sparse='csr', dtype=np.float64, accept_large_sparse=True)
         y = check_array(y, ensure_2d=False, dtype=None)
         check_consistent_length(X, y)
-    _, n_features = X.shape
+    n_samples, n_features = X.shape
 
     classes = np.unique(y)
 
@@ -400,17 +400,21 @@ def _logistic_regression_path(X, y, epsilon, data_norm, pos_class=None, Cs=10, f
 
     if SKL_LOSS_MODULE:
         func = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept).loss_gradient
+        sw_sum = n_samples
     else:
         func = _logistic_loss_and_grad
+        sw_sum = 1
 
     coefs = []
     n_iter = np.zeros(len(Cs), dtype=np.int32)
     for i, C in enumerate(Cs):
-        vector_mech = Vector(epsilon=epsilon, dimension=n_features + int(fit_intercept), alpha=1. / C,
+        l2_reg_strength = 1.0 / (C * sw_sum)
+        vector_mech = Vector(epsilon=epsilon, dimension=n_features + int(fit_intercept), alpha=l2_reg_strength,
                              function_sensitivity=0.25, data_sensitivity=data_norm, random_state=random_state)
         noisy_logistic_loss = vector_mech.randomise(func)
 
-        args = (X, target, sample_weight, 1. / C) if SKL_LOSS_MODULE else (X, target, 1. / C, sample_weight)
+        args = (X, target, sample_weight, l2_reg_strength) if SKL_LOSS_MODULE else (X, target, l2_reg_strength,
+                                                                                    sample_weight)
 
         iprint = [-1, 50, 1, 100, 101][np.searchsorted(np.array([0, 1, 2, 3]), verbose)]
         output_vec, _, info = optimize.fmin_l_bfgs_b(noisy_logistic_loss, output_vec, fprime=None,
diff --git a/setup.py b/setup.py
index 4d9e78a..34ec167 100644
--- a/setup.py
+++ b/setup.py
@@ -63,6 +63,7 @@ def get_version(file_path):
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
         'Topic :: Software Development :: Libraries',
         'Topic :: Software Development :: Libraries :: Python Modules',
         'Topic :: Scientific/Engineering',
diff --git a/tests/models/test_LogisticRegression.py b/tests/models/test_LogisticRegression.py
index 06d4ba1..696bac5 100644
--- a/tests/models/test_LogisticRegression.py
+++ b/tests/models/test_LogisticRegression.py
@@ -1,8 +1,9 @@
 import numpy as np
-from unittest import TestCase
+from unittest import TestCase, skipIf
 
 from diffprivlib.models.logistic_regression import LogisticRegression
 from diffprivlib.utils import PrivacyLeakWarning, DiffprivlibCompatibilityWarning, BudgetError
+from sklearn import __version__ as sklearn_version
 
 
 class TestLogisticRegression(TestCase):
@@ -151,6 +152,10 @@ def test_different_results(self):
 
         self.assertTrue(np.any(predict1 != predict2) or np.any(predict1 != predict3))
 
+    # Compare (major, minor) numerically: as plain strings, "1.10" would sort before "1.4" and skip wrongly.
+    @skipIf(
+        tuple(int(part) for part in sklearn_version.split(".")[:2]) < (1, 4),
+        "The penalty was scaled incorrectly in previous versions (Scikit-Learn GH 26721)")
     def test_same_results(self):
         from sklearn import datasets
         from sklearn.model_selection import train_test_split
@@ -180,7 +185,7 @@ def test_simple(self):
         X -= 3.0
         X /= 2.5
 
-        clf = LogisticRegression(epsilon=2, data_norm=1.0, random_state=0)
+        clf = LogisticRegression(epsilon=2, data_norm=1.0, random_state=1)
         clf.fit(X, y)
 
         self.assertIsNotNone(clf)