{ "data_id": "44512", "name": "APSFailure_seed_4_nrows_2000_nclasses_10_ncols_100_stratify_True", "exact_name": "APSFailure_seed_4_nrows_2000_nclasses_10_ncols_100_stratify_True", "version": 1, "version_label": "4fff356a-e831-4d3e-a37f-46e24402d873", "description": "Subsampling of the dataset APSFailure (41138) with\n\nseed=4\nargs.nrows=2000\nargs.ncols=100\nargs.nclasses=10\nargs.no_stratify=True\nGenerated with the following source code:\n\n\n```python\n def subsample(\n self,\n seed: int,\n nrows_max: int = 2_000,\n ncols_max: int = 100,\n nclasses_max: int = 10,\n stratified: bool = True,\n ) -> Dataset:\n rng = np.random.default_rng(seed)\n\n x = self.x\n y = self.y\n\n # Uniformly sample\n classes = y.unique()\n if len(classes) > nclasses_max:\n vcs = y.value_counts()\n selected_classes = rng.choice(\n classes,\n size=nclasses_max,\n replace=False,\n p=vcs \/ sum(vcs),\n )\n\n # Select the indices where one of these classes is present\n idxs = y.index[y.isin(classes)]\n x = x.iloc[idxs]\n y = y.iloc[idxs]\n\n # Uniformly sample columns if required\n if len(x.columns) > ncols_max:\n columns_idxs = rng.choice(\n list(range(len(x.columns))), size=ncols_max, replace=False\n )\n sorted_column_idxs = sorted(columns_idxs)\n selected_columns = list(x.columns[sorted_column_idxs])\n x = x[selected_columns]\n else:\n sorted_column_idxs = list(range(len(x.columns)))\n\n if len(x) > nrows_max:\n # Stratify accordingly\n target_name = y.name\n data = pd.concat((x, y), axis=\"columns\")\n _, subset = train_test_split(\n data,\n test_size=nrows_max,\n stratify=data[target_name],\n shuffle=True,\n random_state=seed,\n )\n x = subset.drop(target_name, axis=\"columns\")\n y = subset[target_name]\n\n # We need to convert categorical columns to string for openml\n categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]\n columns = list(x.columns)\n\n return Dataset(\n # Technically this is not the same but it's where it was derived from\n dataset=self.dataset,\n x=x,\n y=y,\n categorical_mask=categorical_mask,\n columns=columns,\n )\n```", "format": "arff", "uploader": "Eddie Bergman", "uploader_id": 32840, "visibility": "public", "creator": "\"Eddie Bergman\"", "contributor": null, "date": "2022-11-17 18:24:21", "update_comment": null, "last_update": "2022-11-17 18:24:21", "licence": "CC0", "status": "active", "error_message": null, "url": "https:\/\/api.openml.org\/data\/download\/22111274\/dataset", "default_target_attribute": "class", "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "APSFailure_seed_4_nrows_2000_nclasses_10_ncols_100_stratify_True", "Subsampling of the dataset APSFailure (41138) with seed=4 args.nrows=2000 args.ncols=100 args.nclasses=10 args.no_stratify=True Generated with the following source code: ```python def subsample( self, seed: int, nrows_max: int = 2_000, ncols_max: int = 100, nclasses_max: int = 10, stratified: bool = True, ) -> Dataset: rng = np.random.default_rng(seed) x = self.x y = self.y # Uniformly sample classes = y.unique() if len(classes) > nclasses_max: vcs = y.value_counts() selected_classes = rng.choic " ], "weight": 5 }, "qualities": { "NumberOfInstances": 2000, "NumberOfFeatures": 101, "NumberOfClasses": 2, "NumberOfMissingValues": 19156, "NumberOfInstancesWithMissingValues": 1975, "NumberOfNumericFeatures": 100, "NumberOfSymbolicFeatures": 1, "MinorityClassSize": 36, "NumberOfBinaryFeatures": 1, "PercentageOfBinaryFeatures": 0.9900990099009901, "PercentageOfInstancesWithMissingValues": 98.75, "AutoCorrelation": 0.9649824912456229, "PercentageOfMissingValues": 9.483168316831684, "Dimensionality": 0.0505, "PercentageOfNumericFeatures": 99.00990099009901, "MajorityClassPercentage": 98.2, "PercentageOfSymbolicFeatures": 0.9900990099009901, "MajorityClassSize": 1964, "MinorityClassPercentage": 1.7999999999999998 }, "tags": [ { "uploader": "38960", "tag": "Machine Learning" }, { "uploader": "38960", "tag": "Mathematics" } ], "features": [ { "name": "class", "index": "100", "type": "nominal", "distinct": "2", "missing": "0", "target": "1", "distr": [ [ "pos", "neg" ], [ [ "36", "0" ], [ "0", "1964" ] ] ] }, { "name": "ad_000", "index": "0", "type": "numeric", "distinct": "503", "missing": "495", "min": "0", "max": "9262", "mean": "405", "stdev": "826" }, { "name": "af_000", "index": "1", "type": "numeric", "distinct": "35", "missing": "79", "min": "0", "max": "5814", "mean": "11", "stdev": "187" }, { "name": "ag_000", "index": "2", "type": "numeric", "distinct": "6", "missing": "20", "min": "0", "max": "536290", "mean": "283", "stdev": "12059" }, { "name": "ag_004", "index": "3", "type": "numeric", "distinct": "1497", "missing": "20", "min": "0", "max": "27963266", "mean": "402803", "stdev": "1988923" }, { "name": "ag_005", "index": "4", "type": "numeric", "distinct": "1875", "missing": "20", "min": "0", "max": "43294646", "mean": "1037475", "stdev": "2954923" }, { "name": "ag_007", "index": "5", "type": "numeric", "distinct": "1420", "missing": "20", "min": "0", "max": "32056506", "mean": "477319", "stdev": "1371280" }, { "name": "ag_009", "index": "6", "type": "numeric", "distinct": "534", "missing": "20", "min": "0", "max": "25198514", "mean": "14895", "stdev": "566361" }, { "name": "ah_000", "index": "7", "type": "numeric", "distinct": "1948", "missing": "21", "min": "0", "max": "46926296", "mean": "1734543", "stdev": "4122972" }, { "name": "ak_000", "index": "8", "type": "numeric", "distinct": "10", "missing": "143", "min": "0", "max": "920820", "mean": "636", "stdev": "22027" }, { "name": "am_0", "index": "9", "type": "numeric", "distinct": "671", "missing": "17", "min": "0", "max": "11532572", "mean": "83718", "stdev": "613731" }, { "name": "an_000", "index": "10", "type": "numeric", "distinct": "1963", "missing": "21", "min": "0", "max": "83942092", "mean": "3350024", "stdev": "7763984" }, { "name": "ao_000", "index": "11", "type": "numeric", "distinct": "1964", "missing": "16", "min": "0", "max": "75366796", "mean": "2896652", "stdev": "6729324" }, { "name": "aq_000", "index": "12", "type": "numeric", "distinct": "1892", "missing": "16", "min": "0", "max": "18795350", "mean": "426376", "stdev": "1273898" }, { "name": "as_000", "index": "13", "type": "numeric", "distinct": "3", "missing": "17", "min": "0", "max": "1246190", "mean": "1213", "stdev": "38214" }, { "name": "au_000", "index": "14", "type": "numeric", "distinct": "5", "missing": "17", "min": "0", "max": "2626676", "mean": "1690", "stdev": "61154" }, { "name": "av_000", "index": "15", "type": "numeric", "distinct": "676", "missing": "80", "min": "0", "max": "188124", "mean": "1093", "stdev": "5806" }, { "name": "ay_000", "index": "16", "type": "numeric", "distinct": "16", "missing": "20", "min": "0", "max": "288166", "mean": "461", "stdev": "8653" }, { "name": "ay_001", "index": "17", "type": "numeric", "distinct": "37", "missing": "20", "min": "0", "max": "619436", "mean": "595", "stdev": "15337" }, { "name": "ay_003", "index": "18", "type": "numeric", "distinct": "41", "missing": "20", "min": "0", "max": "11439526", "mean": "6452", "stdev": "257629" }, { "name": "ay_004", "index": "19", "type": "numeric", "distinct": "62", "missing": "20", "min": "0", "max": "14266356", "mean": "11120", "stdev": "343068" }, { "name": "ay_005", "index": "20", "type": "numeric", "distinct": "957", "missing": "20", "min": "0", "max": "34320190", "mean": "98983", "stdev": "1156274" }, { "name": "ay_009", "index": "21", "type": "numeric", "distinct": "20", "missing": "20", "min": "0", "max": "182060", "mean": "255", "stdev": "5454" }, { "name": "az_002", "index": "22", "type": "numeric", "distinct": "1174", "missing": "20", "min": "0", "max": "12088584", "mean": "11919", "stdev": "274366" }, { "name": "az_003", "index": "23", "type": "numeric", "distinct": "1488", "missing": "20", "min": "0", "max": "29346242", "mean": "114009", "stdev": "1046211" }, { "name": "az_004", "index": "24", "type": "numeric", "distinct": "1768", "missing": "20", "min": "0", "max": "57117356", "mean": "1371708", "stdev": "4000839" }, { "name": "az_006", "index": "25", "type": "numeric", "distinct": "1050", "missing": "20", "min": "0", "max": "23399636", "mean": "115859", "stdev": "981755" }, { "name": "az_007", "index": "26", "type": "numeric", "distinct": "213", "missing": "20", "min": "0", "max": "9534824", "mean": "16289", "stdev": "243111" }, { "name": "az_008", "index": "27", "type": "numeric", "distinct": "76", "missing": "20", "min": "0", "max": "733982", "mean": "722", "stdev": "17162" }, { "name": "az_009", "index": "28", "type": "numeric", "distinct": "23", "missing": "20", "min": "0", "max": "15154", "mean": "18", "stdev": "437" }, { "name": "ba_000", "index": "29", "type": "numeric", "distinct": "1929", "missing": "20", "min": "0", "max": "57142352", "mean": "1298414", "stdev": "3239948" }, { "name": "ba_002", "index": "30", "type": "numeric", "distinct": "1768", "missing": "20", "min": "0", "max": "16970286", "mean": "390661", "stdev": "1112540" }, { "name": "ba_005", "index": "31", "type": "numeric", "distinct": "1565", "missing": "20", "min": "0", "max": "12283190", "mean": "178090", "stdev": "500986" }, { "name": "ba_006", "index": "32", "type": "numeric", "distinct": "1529", "missing": "20", "min": "0", "max": "14595198", "mean": "201292", "stdev": "620113" }, { "name": "ba_007", "index": "33", "type": "numeric", "distinct": "1326", "missing": "20", "min": "0", "max": "7922058", "mean": "181202", "stdev": "512093" }, { "name": "ba_008", "index": "34", "type": "numeric", "distinct": "732", "missing": "20", "min": "0", "max": "3086468", "mean": "36082", "stdev": "189215" }, { "name": "bb_000", "index": "35", "type": "numeric", "distinct": "1964", "missing": "21", "min": "0", "max": "126355262", "mean": "4390562", "stdev": "10918084" }, { "name": "bc_000", "index": "36", "type": "numeric", "distinct": "388", "missing": "88", "min": "0", "max": "45810", "mean": "474", "stdev": "2277" }, { "name": "bd_000", "index": "37", "type": "numeric", "distinct": "559", "missing": "88", "min": "0", "max": "66142", "mean": "802", "stdev": "3196" }, { "name": "bf_000", "index": "38", "type": "numeric", "distinct": "176", "missing": "79", "min": "0", "max": "8392", "mean": "65", "stdev": "345" }, { "name": "bg_000", "index": "39", "type": "numeric", "distinct": "1947", "missing": "21", "min": "0", "max": "46926296", "mean": "1734734", "stdev": "4123017" }, { "name": "bh_000", "index": "40", "type": "numeric", "distinct": "1669", "missing": "21", "min": "0", "max": "2082350", "mean": "55330", "stdev": "148682" }, { "name": "bj_000", "index": "41", "type": "numeric", "distinct": "1932", "missing": "16", "min": "0", "max": "40750038", "mean": "499099", "stdev": "1904947" }, { "name": "bl_000", "index": "42", "type": "numeric", "distinct": "907", "missing": "920", "min": "0", "max": "1310700", "mean": "337158", "stdev": "338543" }, { "name": "bm_000", "index": "43", "type": "numeric", "distinct": "506", "missing": "1314", "min": "0", "max": "1310700", "mean": "427459", "stdev": "432125" }, { "name": "bn_000", "index": "44", "type": "numeric", "distinct": "341", "missing": "1473", "min": "0", "max": "1310700", "mean": "500051", "stdev": "492155" }, { "name": "bo_000", "index": "45", "type": "numeric", "distinct": "264", "missing": "1543", "min": "0", "max": "1310700", "mean": "558515", "stdev": "521803" }, { "name": "bp_000", "index": "46", "type": "numeric", "distinct": "216", "missing": "1585", "min": "0", "max": "1310700", "mean": "606172", "stdev": "539373" }, { "name": "bq_000", "index": "47", "type": "numeric", "distinct": "178", "missing": "1618", "min": "0", "max": "1310700", "mean": "643213", "stdev": "551880" }, { "name": "br_000", "index": "48", "type": "numeric", "distinct": "160", "missing": "1634", "min": "0", "max": "1310700", "mean": "662659", "stdev": "558228" }, { "name": "bt_000", "index": "49", "type": "numeric", "distinct": "1960", "missing": "4", "min": "0", "max": "2038561", "mean": "56337", "stdev": "140569" }, { "name": "bx_000", "index": "50", "type": "numeric", "distinct": "1886", "missing": "104", "min": "192", "max": "131172688", "mean": "3826933", "stdev": "9650804" }, { "name": "bz_000", "index": "51", "type": "numeric", "distinct": "1180", "missing": "88", "min": "0", "max": "11155250", "mean": "89120", "stdev": "487241" }, { "name": "cb_000", "index": "52", "type": "numeric", "distinct": "1904", "missing": "22", "min": "0", "max": "1209460", "mean": "390420", "stdev": "364161" }, { "name": "cd_000", "index": "53", "type": "numeric", "distinct": "1", "missing": "21", "min": "1209600", "max": "1209600", "mean": "1209600", "stdev": "0" }, { "name": "ce_000", "index": "54", "type": "numeric", "distinct": "1393", "missing": "80", "min": "0", "max": "1786700", "mean": "59138", "stdev": "124104" }, { "name": "cf_000", "index": "55", "type": "numeric", "distinct": "48", "missing": "495", "min": "0", "max": "6692", "mean": "21", "stdev": "263" }, { "name": "ch_000", "index": "56", "type": "numeric", "distinct": "2", "missing": "495", "min": "0", "max": "2", "mean": "0", "stdev": "0" }, { "name": "cj_000", "index": "57", "type": "numeric", "distinct": "358", "missing": "9", "min": "0", "max": "18989890", "mean": "106937", "stdev": "1044392" }, { "name": "cm_000", "index": "58", "type": "numeric", "distinct": "362", "missing": "339", "min": "0", "max": "36116", "mean": "368", "stdev": "1945" }, { "name": "cn_001", "index": "59", "type": "numeric", "distinct": "268", "missing": "20", "min": "0", "max": "3453942", "mean": "20858", "stdev": "172862" }, { "name": "cn_002", "index": "60", "type": "numeric", "distinct": "897", "missing": "20", "min": "0", "max": "15387720", "mean": "154393", "stdev": "911495" }, { "name": "cn_004", "index": "61", "type": "numeric", "distinct": "1915", "missing": "20", "min": "0", "max": "44546640", "mean": "1215317", "stdev": "3123339" }, { "name": "cn_006", "index": "62", "type": "numeric", "distinct": "1656", "missing": "20", "min": "0", "max": "15579888", "mean": "388140", "stdev": "1073627" }, { "name": "cn_008", "index": "63", "type": "numeric", "distinct": "1125", "missing": "20", "min": "0", "max": "2583856", "mean": "14874", "stdev": "110773" }, { "name": "cr_000", "index": "64", "type": "numeric", "distinct": "5", "missing": "1547", "min": "0", "max": "13940", "mean": "41", "stdev": "673" }, { "name": "cs_000", "index": "65", "type": "numeric", "distinct": "1473", "missing": "20", "min": "0", "max": "139514", "mean": "5406", "stdev": "10204" }, { "name": "cs_002", "index": "66", "type": "numeric", "distinct": "1520", "missing": "20", "min": "0", "max": "18068018", "mean": "216325", "stdev": "993462" }, { "name": "cs_004", "index": "67", "type": "numeric", "distinct": "1817", "missing": "20", "min": "0", "max": "35085072", "mean": "413308", "stdev": "1842649" }, { "name": "cs_005", "index": "68", "type": "numeric", "distinct": "1919", "missing": "20", "min": "0", "max": "69216648", "mean": "2111824", "stdev": "5173022" }, { "name": "cs_007", "index": "69", "type": "numeric", "distinct": "1642", "missing": "20", "min": "0", "max": "375364", "mean": "14291", "stdev": "25605" }, { "name": "cs_008", "index": "70", "type": "numeric", "distinct": "269", "missing": "20", "min": "0", "max": "39912", "mean": "130", "stdev": "916" }, { "name": "cs_009", "index": "71", "type": "numeric", "distinct": "11", "missing": "20", "min": "0", "max": "25028", "mean": "13", "stdev": "562" }, { "name": "ct_000", "index": "72", "type": "numeric", "distinct": "607", "missing": "452", "min": "0", "max": "23114", "mean": "674", "stdev": "1535" }, { "name": "cv_000", "index": "73", "type": "numeric", "distinct": "1491", "missing": "452", "min": "0", "max": "68669536", "mean": "1869762", "stdev": "3903157" }, { "name": "cy_000", "index": "74", "type": "numeric", "distinct": "98", "missing": "452", "min": "0", "max": "119406", "mean": "192", "stdev": "4036" }, { "name": "dd_000", "index": "75", "type": "numeric", "distinct": "1088", "missing": "80", "min": "0", "max": "146456", "mean": "2854", "stdev": "8255" }, { "name": "de_000", "index": "76", "type": "numeric", "distinct": "430", "missing": "88", "min": "0", "max": "38532", "mean": "366", "stdev": "1299" }, { "name": "df_000", "index": "77", "type": "numeric", "distinct": "30", "missing": "129", "min": "0", "max": "4075750", "mean": "3426", "stdev": "99885" }, { "name": "dg_000", "index": "78", "type": "numeric", "distinct": "55", "missing": "129", "min": "0", "max": "20880422", "mean": "18905", "stdev": "543190" }, { "name": "dk_000", "index": "79", "type": "numeric", "distinct": "7", "missing": "129", "min": "0", "max": "2950546", "mean": "1940", "stdev": "69139" }, { "name": "dl_000", "index": "80", "type": "numeric", "distinct": "8", "missing": "129", "min": "0", "max": "103858120", "mean": "81151", "stdev": "2641986" }, { "name": "dm_000", "index": "81", "type": "numeric", "distinct": "8", "missing": "129", "min": "0", "max": "14239782", "mean": "12574", "stdev": "368188" }, { "name": "do_000", "index": "82", "type": "numeric", "distinct": "1300", "missing": "88", "min": "0", "max": "1358378", "mean": "27526", "stdev": "62342" }, { "name": "dq_000", "index": "83", "type": "numeric", "distinct": "415", "missing": "88", "min": "0", "max": "384110966", "mean": "1359058", "stdev": "11651900" }, { "name": "ds_000", "index": "84", "type": "numeric", "distinct": "1585", "missing": "88", "min": "0", "max": "3127424", "mean": "83845", "stdev": "192826" }, { "name": "dt_000", "index": "85", "type": "numeric", "distinct": "1343", "missing": "88", "min": "0", "max": "400166", "mean": "14644", "stdev": "32157" }, { "name": "dv_000", "index": "86", "type": "numeric", "distinct": "1618", "missing": "88", "min": "0", "max": "41555112", "mean": "546642", "stdev": "1746263" }, { "name": "dx_000", "index": "87", "type": "numeric", "distinct": "582", "missing": "88", "min": "0", "max": "51038178", "mean": "683203", "stdev": "3254519" }, { "name": "ea_000", "index": "88", "type": "numeric", "distinct": "14", "missing": "88", "min": "0", "max": "1460", "mean": "1", "stdev": "34" }, { "name": "eb_000", "index": "89", "type": "numeric", "distinct": "1138", "missing": "129", "min": "0", "max": "1031945600", "mean": "10438107", "stdev": "51519268" }, { "name": "ec_00", "index": "90", "type": "numeric", "distinct": "1547", "missing": "349", "min": "0", "max": "83520", "mean": "1453", "stdev": "4510" }, { "name": "ed_000", "index": "91", "type": "numeric", "distinct": "841", "missing": "330", "min": "0", "max": "68548", "mean": "1516", "stdev": "4037" }, { "name": "ee_000", "index": "92", "type": "numeric", "distinct": "1918", "missing": "20", "min": "0", "max": "56601752", "mean": "680599", "stdev": "2194244" }, { "name": "ee_001", "index": "93", "type": "numeric", "distinct": "1907", "missing": "20", "min": "0", "max": "29648248", "mean": "727200", "stdev": "2104059" }, { "name": "ee_003", "index": "94", "type": "numeric", "distinct": "1672", "missing": "20", "min": "0", "max": "7510214", "mean": "198606", "stdev": "484608" }, { "name": "ee_004", "index": "95", "type": "numeric", "distinct": "1711", "missing": "20", "min": "0", "max": "20149172", "mean": "422234", "stdev": "1063703" }, { "name": "ee_006", "index": "96", "type": "numeric", "distinct": "1584", "missing": "20", "min": "0", "max": "18657088", "mean": "312151", "stdev": "965139" }, { "name": "ee_008", "index": "97", "type": "numeric", "distinct": "1232", "missing": "20", "min": "0", "max": "14045640", "mean": "137016", "stdev": "517724" }, { "name": "ef_000", "index": "98", "type": "numeric", "distinct": "3", "missing": "88", "min": "0", "max": "4", "mean": "0", "stdev": "0" }, { "name": "eg_000", "index": "99", "type": "numeric", "distinct": "8", "missing": "88", "min": "0", "max": "70", "mean": "0", "stdev": "2" } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }