Skip to content

Commit 03df57a

Browse files
change data processing and regularization so that models end up converging (#190)
1 parent d2124b6 commit 03df57a

File tree

3 files changed

+52
-8
lines changed

3 files changed

+52
-8
lines changed

configs/regular/logreg.json

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,18 @@
2020
"n_classes": [2, 5],
2121
"n_informative": "[SPECIAL_VALUE]0.6",
2222
"class_sep": 1.0
23-
},
23+
}
24+
],
25+
"split_kwargs": {
26+
"train_size": 0.05,
27+
"test_size": 0.95
28+
}
29+
}
30+
},
31+
{
32+
"data": {
33+
"source": "make_classification",
34+
"generation_kwargs": [
2435
{
2536
"n_samples": 1000000,
2637
"n_features": 500,
@@ -33,12 +44,41 @@
3344
"train_size": 0.05,
3445
"test_size": 0.95
3546
}
36-
}
47+
},
48+
"algorithm": {"estimator_params": {"C": 1e-6}}
3749
},
38-
{ "data": { "dataset": "mnist", "split_kwargs": { "train_size": 10000, "test_size": null } } },
39-
{ "data": { "dataset": ["susy", "hepmass"], "split_kwargs": { "train_size": 0.1, "test_size": null } } },
40-
{ "data": { "dataset": "cifar", "split_kwargs": { "train_size": 0.1, "test_size": null } } },
41-
{ "data": { "dataset": "gisette", "split_kwargs": { "train_size": 2000, "test_size": null } } }
50+
{
51+
"data": { "dataset": "mnist", "split_kwargs": { "train_size": 10000, "test_size": null } },
52+
"algorithm": {"estimator_params": {"C": 1e-8}}
53+
},
54+
{
55+
"data": {
56+
"dataset": "susy",
57+
"split_kwargs": { "train_size": 0.1, "test_size": null }
58+
},
59+
"algorithm": { "estimator_params": {"C": 1e-2} }
60+
},
61+
{
62+
"data": {
63+
"dataset": "hepmass",
64+
"split_kwargs": { "train_size": 0.1, "test_size": null }
65+
},
66+
"algorithm": { "estimator_params": {"C": 1e-5} }
67+
},
68+
{
69+
"data": {
70+
"dataset": "cifar",
71+
"split_kwargs": { "train_size": 0.1, "test_size": null }
72+
},
73+
"algorithm": { "estimator_params": {"C": 1e-9} }
74+
},
75+
{
76+
"data": {
77+
"dataset": "gisette",
78+
"split_kwargs": { "train_size": 2000, "test_size": null }
79+
},
80+
"algorithm": { "estimator_params": {"C": 1e1} }
81+
}
4282
]
4383
},
4484
"TEMPLATES": {

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
[tool.black]
1919
line-length = 90
20-
target-version = ['py39', 'py310', 'py311', 'py312']
20+
target-version = ['py39', 'py310', 'py311', 'py312', 'py313']
2121
extend-ignore = 'E203'
2222

2323
[tool.isort]

sklbench/datasets/loaders.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
make_moons,
3131
make_regression,
3232
)
33+
from sklearn.preprocessing import StandardScaler
3334

3435
from .common import cache, load_data_description, load_data_from_cache, preprocess
3536
from .downloaders import download_and_read_csv, load_openml, retrieve
@@ -198,7 +199,7 @@ def load_hepmass(
198199
data = pd.concat([train_data, test_data])
199200
label = data.columns[0]
200201
y = data[label]
201-
x = data.drop(columns=[label])
202+
x = data.drop(columns=[label, "mass"])
202203

203204
data_desc = {
204205
"n_classes": 2,
@@ -418,6 +419,8 @@ def convert_y(y, n_samples):
418419
x = np.vstack([x_train, x_test])
419420
y = np.hstack([y_train, y_test])
420421

422+
x = StandardScaler(with_mean=True, with_std=True).fit_transform(x)
423+
421424
data_desc = {
422425
"n_classes": 2,
423426
"default_split": {
@@ -555,6 +558,7 @@ def load_cifar(
555558
Classification task. n_classes = 10.
556559
"""
557560
x, y = load_openml(40927, raw_data_cache)
561+
x = StandardScaler(with_mean=True, with_std=False).fit_transform(x)
558562
binary = dataset_params.get("binary", False)
559563
if binary:
560564
y = (y > 0).astype(int)

0 commit comments

Comments (0)