{ "data_id": "44510", "name": "APSFailure_seed_2_nrows_2000_nclasses_10_ncols_100_stratify_True", "exact_name": "APSFailure_seed_2_nrows_2000_nclasses_10_ncols_100_stratify_True", "version": 1, "version_label": "e959b63e-7610-44d8-9a70-154271467d7f", "description": "Subsampling of the dataset APSFailure (41138) with\n\nseed=2\nargs.nrows=2000\nargs.ncols=100\nargs.nclasses=10\nargs.no_stratify=True\nGenerated with the following source code:\n\n\n```python\n def subsample(\n self,\n seed: int,\n nrows_max: int = 2_000,\n ncols_max: int = 100,\n nclasses_max: int = 10,\n stratified: bool = True,\n ) -> Dataset:\n rng = np.random.default_rng(seed)\n\n x = self.x\n y = self.y\n\n # Uniformly sample\n classes = y.unique()\n if len(classes) > nclasses_max:\n vcs = y.value_counts()\n selected_classes = rng.choice(\n classes,\n size=nclasses_max,\n replace=False,\n p=vcs \/ sum(vcs),\n )\n\n # Select the indices where one of these classes is present\n idxs = y.index[y.isin(classes)]\n x = x.iloc[idxs]\n y = y.iloc[idxs]\n\n # Uniformly sample columns if required\n if len(x.columns) > ncols_max:\n columns_idxs = rng.choice(\n list(range(len(x.columns))), size=ncols_max, replace=False\n )\n sorted_column_idxs = sorted(columns_idxs)\n selected_columns = list(x.columns[sorted_column_idxs])\n x = x[selected_columns]\n else:\n sorted_column_idxs = list(range(len(x.columns)))\n\n if len(x) > nrows_max:\n # Stratify accordingly\n target_name = y.name\n data = pd.concat((x, y), axis=\"columns\")\n _, subset = train_test_split(\n data,\n test_size=nrows_max,\n stratify=data[target_name],\n shuffle=True,\n random_state=seed,\n )\n x = subset.drop(target_name, axis=\"columns\")\n y = subset[target_name]\n\n # We need to convert categorical columns to string for openml\n categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]\n columns = list(x.columns)\n\n return Dataset(\n # Technically this is not the same but it's where it was derived from\n dataset=self.dataset,\n x=x,\n y=y,\n categorical_mask=categorical_mask,\n columns=columns,\n )\n```", "format": "arff", "uploader": "Eddie Bergman", "uploader_id": 32840, "visibility": "public", "creator": "\"Eddie Bergman\"", "contributor": null, "date": "2022-11-17 18:24:11", "update_comment": null, "last_update": "2022-11-17 18:24:11", "licence": "CC0", "status": "active", "error_message": null, "url": "https:\/\/api.openml.org\/data\/download\/22111272\/dataset", "default_target_attribute": "class", "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "APSFailure_seed_2_nrows_2000_nclasses_10_ncols_100_stratify_True", "Subsampling of the dataset APSFailure (41138) with seed=2 args.nrows=2000 args.ncols=100 args.nclasses=10 args.no_stratify=True Generated with the following source code: ```python def subsample( self, seed: int, nrows_max: int = 2_000, ncols_max: int = 100, nclasses_max: int = 10, stratified: bool = True, ) -> Dataset: rng = np.random.default_rng(seed) x = self.x y = self.y # Uniformly sample classes = y.unique() if len(classes) > nclasses_max: vcs = y.value_counts() selected_classes = rng.choic " ], "weight": 5 }, "qualities": { "NumberOfInstances": 2000, "NumberOfFeatures": 101, "NumberOfClasses": 2, "NumberOfMissingValues": 17956, "NumberOfInstancesWithMissingValues": 1985, "NumberOfNumericFeatures": 100, "NumberOfSymbolicFeatures": 1, "PercentageOfBinaryFeatures": 0.9900990099009901, "PercentageOfInstancesWithMissingValues": 99.25, "AutoCorrelation": 0.9649824912456229, "PercentageOfMissingValues": 8.889108910891089, "Dimensionality": 0.0505, "PercentageOfNumericFeatures": 99.00990099009901, "MajorityClassPercentage": 98.2, "PercentageOfSymbolicFeatures": 0.9900990099009901, "MajorityClassSize": 1964, "MinorityClassPercentage": 1.7999999999999998, "MinorityClassSize": 36, "NumberOfBinaryFeatures": 1 }, "tags": [ { "uploader": "38960", "tag": "Machine Learning" }, { "uploader": "38960", "tag": "Mathematics" } ], "features": [ { "name": "class", "index": "100", "type": "nominal", "distinct": "2", "missing": "0", "target": "1", "distr": [ [ "pos", "neg" ], [ [ "36", "0" ], [ "0", "1964" ] ] ] }, { "name": "ae_000", "index": "0", "type": "numeric", "distinct": "38", "missing": "80", "min": "0", "max": "1012", "mean": "4", "stdev": "45" }, { "name": "ag_000", "index": "1", "type": "numeric", "distinct": "3", "missing": "14", "min": "0", "max": "2664", "mean": "1", "stdev": "60" }, { "name": "ag_001", "index": "2", "type": "numeric", "distinct": "26", "missing": "14", "min": "0", "max": "135062", "mean": "200", "stdev": "3893" }, { "name": "ag_003", "index": "3", "type": "numeric", "distinct": "413", "missing": "14", "min": "0", "max": "14044790", "mean": "79182", "stdev": "563029" }, { "name": "ag_005", "index": "4", "type": "numeric", "distinct": "1887", "missing": "14", "min": "0", "max": "92749198", "mean": "1170114", "stdev": "3748003" }, { "name": "ag_007", "index": "5", "type": "numeric", "distinct": "1437", "missing": "14", "min": "0", "max": "34047362", "mean": "517143", "stdev": "1672656" }, { "name": "ag_008", "index": "6", "type": "numeric", "distinct": "1097", "missing": "14", "min": "0", "max": "4676970", "mean": "33585", "stdev": "167826" }, { "name": "ag_009", "index": "7", "type": "numeric", "distinct": "528", "missing": "14", "min": "0", "max": "610006", "mean": "2483", "stdev": "20508" }, { "name": "ah_000", "index": "8", "type": "numeric", "distinct": "1958", "missing": "12", "min": "0", "max": "82073576", "mean": "1851105", "stdev": "4656677" }, { "name": "ai_000", "index": "9", "type": "numeric", "distinct": "192", "missing": "11", "min": "0", "max": "3513716", "mean": "7074", "stdev": "89810" }, { "name": "aj_000", "index": "10", "type": "numeric", "distinct": "177", "missing": "11", "min": "0", "max": "536264", "mean": "592", "stdev": "13854" }, { "name": "al_000", "index": "11", "type": "numeric", "distinct": "682", "missing": "12", "min": "0", "max": "19605940", "mean": "58176", "stdev": "535706" }, { "name": "an_000", "index": "12", "type": "numeric", "distinct": "1971", "missing": "12", "min": "0", "max": "149868570", "mean": "3541727", "stdev": "8541221" }, { "name": "ap_000", "index": "13", "type": "numeric", "distinct": "1968", "missing": "12", "min": "0", "max": "84129814", "mean": "1075698", "stdev": "4085425" }, { "name": "ar_000", "index": "14", "type": "numeric", "distinct": "18", "missing": "88", "min": "0", "max": "218", "mean": "1", "stdev": "7" }, { "name": "as_000", "index": "15", "type": "numeric", "distinct": "1", "missing": "11", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "ax_000", "index": "16", "type": "numeric", "distinct": "440", "missing": "80", "min": "0", "max": "53572", "mean": "385", "stdev": "1903" }, { "name": "ay_002", "index": "17", "type": "numeric", "distinct": "36", "missing": "14", "min": "0", "max": "4150414", "mean": "4924", "stdev": "112042" }, { "name": "ay_003", "index": "18", "type": "numeric", "distinct": "38", "missing": "14", "min": "0", "max": "7258890", "mean": "9398", "stdev": "223780" }, { "name": "ay_004", "index": "19", "type": "numeric", "distinct": "65", "missing": "14", "min": "0", "max": "8420768", "mean": "6961", "stdev": "200364" }, { "name": "ay_006", "index": "20", "type": "numeric", "distinct": "1414", "missing": "14", "min": "0", "max": "88779936", "mean": "1221830", "stdev": "4550671" }, { "name": "ay_008", "index": "21", "type": "numeric", "distinct": "1758", "missing": "14", "min": "0", "max": "198588262", "mean": "1117431", "stdev": "5566926" }, { "name": "ay_009", "index": "22", "type": "numeric", "distinct": "17", "missing": "14", "min": "0", "max": "967378", "mean": "527", "stdev": "21724" }, { "name": "az_000", "index": "23", "type": "numeric", "distinct": "1414", "missing": "14", "min": "0", "max": "1139676", "mean": "7483", "stdev": "36824" }, { "name": "az_001", "index": "24", "type": "numeric", "distinct": "1050", "missing": "14", "min": "0", "max": "1721850", "mean": "5497", "stdev": "50294" }, { "name": "az_006", "index": "25", "type": "numeric", "distinct": "1031", "missing": "14", "min": "0", "max": "9981748", "mean": "89470", "stdev": "563053" }, { "name": "az_009", "index": "26", "type": "numeric", "distinct": "28", "missing": "14", "min": "0", "max": "13628", "mean": "12", "stdev": "317" }, { "name": "ba_002", "index": "27", "type": "numeric", "distinct": "1786", "missing": "14", "min": "0", "max": "23946422", "mean": "437331", "stdev": "1309878" }, { "name": "ba_005", "index": "28", "type": "numeric", "distinct": "1588", "missing": "14", "min": "0", "max": "7639360", "mean": "191663", "stdev": "477051" }, { "name": "ba_006", "index": "29", "type": "numeric", "distinct": "1548", "missing": "14", "min": "0", "max": "12885214", "mean": "213070", "stdev": "635972" }, { "name": "ba_007", "index": "30", "type": "numeric", "distinct": "1343", "missing": "14", "min": "0", "max": "6624664", "mean": "181277", "stdev": "497109" }, { "name": "ba_009", "index": "31", "type": "numeric", "distinct": "420", "missing": "14", "min": "0", "max": "4824252", "mean": "38719", "stdev": "289309" }, { "name": "bc_000", "index": "32", "type": "numeric", "distinct": "383", "missing": "88", "min": "0", "max": "105870", "mean": "501", "stdev": "3248" }, { "name": "bd_000", "index": "33", "type": "numeric", "distinct": "559", "missing": "88", "min": "0", "max": "105362", "mean": "796", "stdev": "4241" }, { "name": "be_000", "index": "34", "type": "numeric", "distinct": "675", "missing": "80", "min": "0", "max": "162952", "mean": "1302", "stdev": "7280" }, { "name": "bg_000", "index": "35", "type": "numeric", "distinct": "1958", "missing": "12", "min": "0", "max": "82073576", "mean": "1854378", "stdev": "4659525" }, { "name": "bh_000", "index": "36", "type": "numeric", "distinct": "1699", "missing": "12", "min": "0", "max": "3868624", "mean": "59585", "stdev": "177648" }, { "name": "bi_000", "index": "37", "type": "numeric", "distinct": "1959", "missing": "10", "min": "0", "max": "32116248", "mean": "516844", "stdev": "1677406" }, { "name": "bj_000", "index": "38", "type": "numeric", "distinct": "1941", "missing": "10", "min": "0", "max": "64370192", "mean": "556153", "stdev": "2665511" }, { "name": "bk_000", "index": "39", "type": "numeric", "distinct": "1096", "missing": "770", "min": "0", "max": "1310700", "mean": "280740", "stdev": "261842" }, { "name": "bl_000", "index": "40", "type": "numeric", "distinct": "922", "missing": "914", "min": "0", "max": "1310700", "mean": "327069", "stdev": "328775" }, { "name": "bm_000", "index": "41", "type": "numeric", "distinct": "536", "missing": "1307", "min": "0", "max": "1310700", "mean": "402181", "stdev": "410164" }, { "name": "bn_000", "index": "42", "type": "numeric", "distinct": "365", "missing": "1461", "min": "0", "max": "1310700", "mean": "467522", "stdev": "469952" }, { "name": "bq_000", "index": "43", "type": "numeric", "distinct": "196", "missing": "1608", "min": "0", "max": "1310700", "mean": "580377", "stdev": "540645" }, { "name": "br_000", "index": "44", "type": "numeric", "distinct": "166", "missing": "1634", "min": "0", "max": "1310700", "mean": "609734", "stdev": "553319" }, { "name": "bs_000", "index": "45", "type": "numeric", "distinct": "1730", "missing": "18", "min": "0", "max": "881260", "mean": "79932", "stdev": "83318" }, { "name": "bu_000", "index": "46", "type": "numeric", "distinct": "1973", "missing": "12", "min": "0", "max": "234981844", "mean": "4658669", "stdev": "12605381" }, { "name": "bv_000", "index": "47", "type": "numeric", "distinct": "1973", "missing": "12", "min": "0", "max": "234981844", "mean": "4658669", "stdev": "12605381" }, { "name": "by_000", "index": "48", "type": "numeric", "distinct": "1583", "missing": "8", "min": "0", "max": "663120", "mean": "22348", "stdev": "54453" }, { "name": "bz_000", "index": "49", "type": "numeric", "distinct": "1171", "missing": "88", "min": "0", "max": "13056068", "mean": "111731", "stdev": "668999" }, { "name": "ca_000", "index": "50", "type": "numeric", "distinct": "1782", "missing": "146", "min": "0", "max": "120944", "mean": "40290", "stdev": "37148" }, { "name": "cb_000", "index": "51", "type": "numeric", "distinct": "1926", "missing": "18", "min": "0", "max": "1208460", "mean": "409453", "stdev": "369885" }, { "name": "cc_000", "index": "52", "type": "numeric", "distinct": "1847", "missing": "109", "min": "0", "max": "226701036", "mean": "3937793", "stdev": "11157846" }, { "name": "cd_000", "index": "53", "type": "numeric", "distinct": "1", "missing": "13", "min": "1209600", "max": "1209600", "mean": "1209600", "stdev": "0" }, { "name": "ce_000", "index": "54", "type": "numeric", "distinct": "1394", "missing": "80", "min": "0", "max": "2763266", "mean": "64201", "stdev": "142434" }, { "name": "cf_000", "index": "55", "type": "numeric", "distinct": "50", "missing": "488", "min": "0", "max": "140934", "mean": "132", "stdev": "3728" }, { "name": "cg_000", "index": "56", "type": "numeric", "distinct": "183", "missing": "488", "min": "0", "max": "11510", "mean": "98", "stdev": "440" }, { "name": "ch_000", "index": "57", "type": "numeric", "distinct": "1", "missing": "488", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "ci_000", "index": "58", "type": "numeric", "distinct": "1973", "missing": "7", "min": "0", "max": "140986130", "mean": "3623257", "stdev": "9456842" }, { "name": "ck_000", "index": "59", "type": "numeric", "distinct": "1970", "missing": "7", "min": "0", "max": "41022310", "mean": "736710", "stdev": "2389187" }, { "name": "cl_000", "index": "60", "type": "numeric", "distinct": "132", "missing": "316", "min": "0", "max": "103424", "mean": "383", "stdev": "4603" }, { "name": "cn_001", "index": "61", "type": "numeric", "distinct": "300", "missing": "14", "min": "0", "max": "6535920", "mean": "17422", "stdev": "178068" }, { "name": "cn_002", "index": "62", "type": "numeric", "distinct": "921", "missing": "14", "min": "0", "max": "19139824", "mean": "149728", "stdev": "839040" }, { "name": "cn_003", "index": "63", "type": "numeric", "distinct": "1839", "missing": "14", "min": "0", "max": "29633044", "mean": "541776", "stdev": "2033987" }, { "name": "cn_004", "index": "64", "type": "numeric", "distinct": "1912", "missing": "14", "min": "0", "max": "75610876", "mean": "1343660", "stdev": "3833619" }, { "name": "cn_008", "index": "65", "type": "numeric", "distinct": "1154", "missing": "14", "min": "0", "max": "3723532", "mean": "19317", "stdev": "172050" }, { "name": "cn_009", "index": "66", "type": "numeric", "distinct": "524", "missing": "14", "min": "0", "max": "5827764", "mean": "6142", "stdev": "135944" }, { "name": "co_000", "index": "67", "type": "numeric", "distinct": "298", "missing": "488", "min": "0", "max": "70624", "mean": "335", "stdev": "2150" }, { "name": "cp_000", "index": "68", "type": "numeric", "distinct": "346", "missing": "88", "min": "0", "max": "496360", "mean": "626", "stdev": "11536" }, { "name": "cr_000", "index": "69", "type": "numeric", "distinct": "6", "missing": "1530", "min": "0", "max": "38078", "mean": "134", "stdev": "2025" }, { "name": "cs_000", "index": "70", "type": "numeric", "distinct": "1511", "missing": "14", "min": "0", "max": "924996", "mean": "6104", "stdev": "23388" }, { "name": "cs_002", "index": "71", "type": "numeric", "distinct": "1530", "missing": "14", "min": "0", "max": "36319024", "mean": "275146", "stdev": "1739777" }, { "name": "cs_003", "index": "72", "type": "numeric", "distinct": "1761", "missing": "14", "min": "0", "max": "60331082", "mean": "370314", "stdev": "1636999" }, { "name": "cs_004", "index": "73", "type": "numeric", "distinct": "1834", "missing": "14", "min": "0", "max": "34092748", "mean": "440923", "stdev": "1854239" }, { "name": "cs_005", "index": "74", "type": "numeric", "distinct": "1919", "missing": "14", "min": "0", "max": "93294288", "mean": "2324860", "stdev": "5978627" }, { "name": "cs_006", "index": "75", "type": "numeric", "distinct": "1907", "missing": "14", "min": "0", "max": "18246078", "mean": "555787", "stdev": "1176770" }, { "name": "cs_008", "index": "76", "type": "numeric", "distinct": "279", "missing": "14", "min": "0", "max": "440748", "mean": "332", "stdev": "9890" }, { "name": "cs_009", "index": "77", "type": "numeric", "distinct": "11", "missing": "14", "min": "0", "max": "402502", "mean": "299", "stdev": "9989" }, { "name": "ct_000", "index": "78", "type": "numeric", "distinct": "605", "missing": "445", "min": "0", "max": "61006", "mean": "725", "stdev": "2458" }, { "name": "cu_000", "index": "79", "type": "numeric", "distinct": "677", "missing": "445", "min": "0", "max": "388920", "mean": "1576", "stdev": "13917" }, { "name": "cv_000", "index": "80", "type": "numeric", "distinct": "1495", "missing": "445", "min": "0", "max": "70830702", "mean": "1997146", "stdev": "4120714" }, { "name": "cy_000", "index": "81", "type": "numeric", "distinct": "97", "missing": "445", "min": "0", "max": "124168", "mean": "275", "stdev": "4265" }, { "name": "cz_000", "index": "82", "type": "numeric", "distinct": "802", "missing": "445", "min": "0", "max": "16266324", "mean": "29798", "stdev": "455587" }, { "name": "da_000", "index": "83", "type": "numeric", "distinct": "23", "missing": "445", "min": "0", "max": "10520", "mean": "12", "stdev": "274" }, { "name": "dc_000", "index": "84", "type": "numeric", "distinct": "1497", "missing": "445", "min": "0", "max": "72983762", "mean": "2321127", "stdev": "4833479" }, { "name": "dd_000", "index": "85", "type": "numeric", "distinct": "1141", "missing": "80", "min": "0", "max": "154908", "mean": "3021", "stdev": "8617" }, { "name": "de_000", "index": "86", "type": "numeric", "distinct": "455", "missing": "88", "min": "0", "max": "67712", "mean": "377", "stdev": "1845" }, { "name": "dg_000", "index": "87", "type": "numeric", "distinct": "46", "missing": "137", "min": "0", "max": "2678356", "mean": "1980", "stdev": "62413" }, { "name": "di_000", "index": "88", "type": "numeric", "distinct": "298", "missing": "137", "min": "0", "max": "8597582", "mean": "29851", "stdev": "319218" }, { "name": "dj_000", "index": "89", "type": "numeric", "distinct": "8", "missing": "137", "min": "0", "max": "1840", "mean": "2", "stdev": "59" }, { "name": "do_000", "index": "90", "type": "numeric", "distinct": "1270", "missing": "88", "min": "0", "max": "1874542", "mean": "30420", "stdev": "87423" }, { "name": "dq_000", "index": "91", "type": "numeric", "distinct": "415", "missing": "88", "min": "0", "max": "245616290", "mean": "1316761", "stdev": "9478539" }, { "name": "dt_000", "index": "92", "type": "numeric", "distinct": "1339", "missing": "88", "min": "0", "max": "603194", "mean": "15687", "stdev": "38393" }, { "name": "du_000", "index": "93", "type": "numeric", "distinct": "1570", "missing": "88", "min": "0", "max": "250471860", "mean": "4198208", "stdev": "12723606" }, { "name": "dx_000", "index": "94", "type": "numeric", "distinct": "565", "missing": "88", "min": "0", "max": "60093966", "mean": "688376", "stdev": "3461790" }, { "name": "eb_000", "index": "95", "type": "numeric", "distinct": "1151", "missing": "137", "min": "0", "max": "1322456920", "mean": "11168313", "stdev": "58963661" }, { "name": "ec_00", "index": "96", "type": "numeric", "distinct": "1563", "missing": "338", "min": "0", "max": "67076", "mean": "1490", "stdev": "4206" }, { "name": "ee_000", "index": "97", "type": "numeric", "distinct": "1924", "missing": "14", "min": "0", "max": "95928594", "mean": "782713", "stdev": "3268314" }, { "name": "ee_004", "index": "98", "type": "numeric", "distinct": "1705", "missing": "14", "min": "0", "max": "27009152", "mean": "437125", "stdev": "1113162" }, { "name": "ef_000", "index": "99", "type": "numeric", "distinct": "4", "missing": "88", "min": "0", "max": "144", "mean": "0", "stdev": "3" } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }