Subsampling of the dataset adult (1590) with
seed=3
args.nrows=2000
args.ncols=100
args.nclasses=10
args.no_stratify=True
Generated with the following source code:
```python
def subsample(
    self,
    seed: int,
    nrows_max: int = 2_000,
    ncols_max: int = 100,
    nclasses_max: int = 10,
    stratified: bool = True,
) -> Dataset:
    """Return a smaller ``Dataset`` derived from this one.

    The reduction happens in three steps, each applied only when the
    corresponding limit is exceeded:

    1. Classes: ``nclasses_max`` target classes are drawn without
       replacement, weighted by their frequency in ``self.y``; only rows
       belonging to a drawn class are kept.
    2. Columns: ``ncols_max`` feature columns are drawn uniformly without
       replacement and kept in their original relative order.
    3. Rows: ``nrows_max`` rows are drawn with ``train_test_split``.

    Args:
        seed: Seeds both the NumPy generator (class/column draws) and the
            ``random_state`` of the row split.
        nrows_max: Maximum number of rows in the result.
        ncols_max: Maximum number of feature columns in the result.
        nclasses_max: Maximum number of distinct target classes.
        stratified: When true, the row split is stratified on the target
            column; when false, rows are drawn without stratification.

    Returns:
        A new ``Dataset`` holding the reduced ``x``/``y``, the matching
        slice of ``self.categorical_mask`` and the retained column names.
    """
    rng = np.random.default_rng(seed)
    x = self.x
    y = self.y

    # Draw classes weighted by frequency. Labels and probabilities are both
    # taken from the same ``value_counts`` result so they stay aligned
    # (``y.unique()`` is in order of appearance, ``value_counts`` is not).
    vcs = y.value_counts()
    # A categorical target reports unused categories with a count of zero;
    # those are not candidates.
    vcs = vcs[vcs > 0]
    if len(vcs) > nclasses_max:
        selected_classes = rng.choice(
            vcs.index.to_numpy(),
            size=nclasses_max,
            replace=False,
            p=(vcs / vcs.sum()).to_numpy(),
        )
        # Boolean positional mask: independent of whatever index x/y carry.
        keep = y.isin(selected_classes).to_numpy()
        x = x.iloc[keep]
        y = y.iloc[keep]

    # Uniformly sample columns if required, preserving their original order.
    n_columns = len(x.columns)
    if n_columns > ncols_max:
        sorted_column_idxs = sorted(
            rng.choice(n_columns, size=ncols_max, replace=False)
        )
        # Positional selection so duplicate column names cannot multiply.
        x = x.iloc[:, sorted_column_idxs]
    else:
        sorted_column_idxs = list(range(n_columns))

    if len(x) > nrows_max:
        target_name = y.name
        data = pd.concat((x, y), axis="columns")
        # Only the "test" side of the split is wanted: it has exactly
        # ``nrows_max`` rows because ``test_size`` is an absolute count.
        _, subset = train_test_split(
            data,
            test_size=nrows_max,
            stratify=data[target_name] if stratified else None,
            shuffle=True,
            random_state=seed,
        )
        x = subset.drop(target_name, axis="columns")
        y = subset[target_name]

    # Keep the categorical flags only for the columns that survived.
    categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]
    columns = list(x.columns)

    return Dataset(
        # Technically this is not the same but it's where it was derived from
        dataset=self.dataset,
        x=x,
        y=y,
        categorical_mask=categorical_mask,
        columns=columns,
    )
```