Skip to content

Commit 03df57a

Browse files
change data processing and regularization so that models end up converging (#190)
1 parent d2124b6 commit 03df57a

File tree

3 files changed

+52
-8
lines changed

3 files changed

+52
-8
lines changed

configs/regular/logreg.json

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,18 @@
2020
"n_classes": [2, 5],
2121
"n_informative": "[SPECIAL_VALUE]0.6",
2222
"class_sep": 1.0
23-
},
23+
}
24+
],
25+
"split_kwargs": {
26+
"train_size": 0.05,
27+
"test_size": 0.95
28+
}
29+
}
30+
},
31+
{
32+
"data": {
33+
"source": "make_classification",
34+
"generation_kwargs": [
2435
{
2536
"n_samples": 1000000,
2637
"n_features": 500,
@@ -33,12 +44,41 @@
3344
"train_size": 0.05,
3445
"test_size": 0.95
3546
}
36-
}
47+
},
48+
"algorithm": {"estimator_params": {"C": 1e-6}}
3749
},
38-
{ "data": { "dataset": "mnist", "split_kwargs": { "train_size": 10000, "test_size": null } } },
39-
{ "data": { "dataset": ["susy", "hepmass"], "split_kwargs": { "train_size": 0.1, "test_size": null } } },
40-
{ "data": { "dataset": "cifar", "split_kwargs": { "train_size": 0.1, "test_size": null } } },
41-
{ "data": { "dataset": "gisette", "split_kwargs": { "train_size": 2000, "test_size": null } } }
50+
{
51+
"data": { "dataset": "mnist", "split_kwargs": { "train_size": 10000, "test_size": null } },
52+
"algorithm": {"estimator_params": {"C": 1e-8}}
53+
},
54+
{
55+
"data": {
56+
"dataset": "susy",
57+
"split_kwargs": { "train_size": 0.1, "test_size": null }
58+
},
59+
"algorithm": { "estimator_params": {"C": 1e-2} }
60+
},
61+
{
62+
"data": {
63+
"dataset": "hepmass",
64+
"split_kwargs": { "train_size": 0.1, "test_size": null }
65+
},
66+
"algorithm": { "estimator_params": {"C": 1e-5} }
67+
},
68+
{
69+
"data": {
70+
"dataset": "cifar",
71+
"split_kwargs": { "train_size": 0.1, "test_size": null }
72+
},
73+
"algorithm": { "estimator_params": {"C": 1e-9} }
74+
},
75+
{
76+
"data": {
77+
"dataset": "gisette",
78+
"split_kwargs": { "train_size": 2000, "test_size": null }
79+
},
80+
"algorithm": { "estimator_params": {"C": 1e1} }
81+
}
4282
]
4383
},
4484
"TEMPLATES": {

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
[tool.black]
1919
line-length = 90
20-
target-version = ['py39', 'py310', 'py311', 'py312']
20+
target-version = ['py39', 'py310', 'py311', 'py312', 'py313']
2121
extend-ignore = 'E203'
2222

2323
[tool.isort]

sklbench/datasets/loaders.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
make_moons,
3131
make_regression,
3232
)
33+
from sklearn.preprocessing import StandardScaler
3334

3435
from .common import cache, load_data_description, load_data_from_cache, preprocess
3536
from .downloaders import download_and_read_csv, load_openml, retrieve
@@ -198,7 +199,7 @@ def load_hepmass(
198199
data = pd.concat([train_data, test_data])
199200
label = data.columns[0]
200201
y = data[label]
201-
x = data.drop(columns=[label])
202+
x = data.drop(columns=[label, "mass"])
202203

203204
data_desc = {
204205
"n_classes": 2,
@@ -418,6 +419,8 @@ def convert_y(y, n_samples):
418419
x = np.vstack([x_train, x_test])
419420
y = np.hstack([y_train, y_test])
420421

422+
x = StandardScaler(with_mean=True, with_std=True).fit_transform(x)
423+
421424
data_desc = {
422425
"n_classes": 2,
423426
"default_split": {
@@ -555,6 +558,7 @@ def load_cifar(
555558
Classification task. n_classes = 10.
556559
"""
557560
x, y = load_openml(40927, raw_data_cache)
561+
x = StandardScaler(with_mean=True, with_std=False).fit_transform(x)
558562
binary = dataset_params.get("binary", False)
559563
if binary:
560564
y = (y > 0).astype(int)

0 commit comments

Comments (0)