{ "data_id": "44509", "name": "APSFailure_seed_1_nrows_2000_nclasses_10_ncols_100_stratify_True", "exact_name": "APSFailure_seed_1_nrows_2000_nclasses_10_ncols_100_stratify_True", "version": 1, "version_label": "6345d8cc-14fb-4441-a2fa-eafe8089f041", "description": "Subsampling of the dataset APSFailure (41138) with\n\nseed=1\nargs.nrows=2000\nargs.ncols=100\nargs.nclasses=10\nargs.no_stratify=True\nGenerated with the following source code:\n\n\n```python\n def subsample(\n self,\n seed: int,\n nrows_max: int = 2_000,\n ncols_max: int = 100,\n nclasses_max: int = 10,\n stratified: bool = True,\n ) -> Dataset:\n rng = np.random.default_rng(seed)\n\n x = self.x\n y = self.y\n\n # Uniformly sample\n classes = y.unique()\n if len(classes) > nclasses_max:\n vcs = y.value_counts()\n selected_classes = rng.choice(\n classes,\n size=nclasses_max,\n replace=False,\n p=vcs \/ sum(vcs),\n )\n\n # Select the indices where one of these classes is present\n idxs = y.index[y.isin(classes)]\n x = x.iloc[idxs]\n y = y.iloc[idxs]\n\n # Uniformly sample columns if required\n if len(x.columns) > ncols_max:\n columns_idxs = rng.choice(\n list(range(len(x.columns))), size=ncols_max, replace=False\n )\n sorted_column_idxs = sorted(columns_idxs)\n selected_columns = list(x.columns[sorted_column_idxs])\n x = x[selected_columns]\n else:\n sorted_column_idxs = list(range(len(x.columns)))\n\n if len(x) > nrows_max:\n # Stratify accordingly\n target_name = y.name\n data = pd.concat((x, y), axis=\"columns\")\n _, subset = train_test_split(\n data,\n test_size=nrows_max,\n stratify=data[target_name],\n shuffle=True,\n random_state=seed,\n )\n x = subset.drop(target_name, axis=\"columns\")\n y = subset[target_name]\n\n # We need to convert categorical columns to string for openml\n categorical_mask = [self.categorical_mask[i] for i in sorted_column_idxs]\n columns = list(x.columns)\n\n return Dataset(\n # Technically this is not the same but it's where it was derived from\n dataset=self.dataset,\n x=x,\n y=y,\n categorical_mask=categorical_mask,\n columns=columns,\n )\n```", "format": "arff", "uploader": "Eddie Bergman", "uploader_id": 32840, "visibility": "public", "creator": "\"Eddie Bergman\"", "contributor": null, "date": "2022-11-17 18:24:06", "update_comment": null, "last_update": "2022-11-17 18:24:06", "licence": "CC0", "status": "active", "error_message": null, "url": "https:\/\/api.openml.org\/data\/download\/22111271\/dataset", "kaggle_url": null, "default_target_attribute": "class", "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "APSFailure_seed_1_nrows_2000_nclasses_10_ncols_100_stratify_True", "Subsampling of the dataset APSFailure (41138) with seed=1 args.nrows=2000 args.ncols=100 args.nclasses=10 args.no_stratify=True Generated with the following source code: ```python def subsample( self, seed: int, nrows_max: int = 2_000, ncols_max: int = 100, nclasses_max: int = 10, stratified: bool = True, ) -> Dataset: rng = np.random.default_rng(seed) x = self.x y = self.y # Uniformly sample classes = y.unique() if len(classes) > nclasses_max: vcs = y.value_counts() selected_classes = rng.choic " ], "weight": 5 }, "qualities": { "NumberOfInstances": 2000, "NumberOfFeatures": 101, "NumberOfClasses": 2, "NumberOfMissingValues": 15419, "NumberOfInstancesWithMissingValues": 1984, "NumberOfNumericFeatures": 100, "NumberOfSymbolicFeatures": 1, "PercentageOfBinaryFeatures": 0.9900990099009901, "PercentageOfInstancesWithMissingValues": 99.2, "PercentageOfMissingValues": 7.633168316831683, "AutoCorrelation": 0.9639819909954978, "PercentageOfNumericFeatures": 99.00990099009901, "Dimensionality": 0.0505, "PercentageOfSymbolicFeatures": 0.9900990099009901, "MajorityClassPercentage": 98.2, "MajorityClassSize": 1964, "MinorityClassPercentage": 1.7999999999999998, "MinorityClassSize": 36, "NumberOfBinaryFeatures": 1 }, "tags": [], "features": [ { "name": "class", "index": "100", "type": "nominal", "distinct": "2", "missing": "0", "target": "1", "distr": [ [ "pos", "neg" ], [ [ "36", "0" ], [ "0", "1964" ] ] ] }, { "name": "ac_000", "index": "0", "type": "numeric", "distinct": "489", "missing": "115", "min": "0", "max": "2130706614", "mean": "359451056", "stdev": "798133375" }, { "name": "af_000", "index": "1", "type": "numeric", "distinct": "42", "missing": "87", "min": "0", "max": "2596", "mean": "10", "stdev": "102" }, { "name": "ag_001", "index": "2", "type": "numeric", "distinct": "25", "missing": "19", "min": "0", "max": "215720", "mean": "440", "stdev": "8186" }, { "name": "ag_003", "index": "3", "type": "numeric", "distinct": "415", "missing": "19", "min": "0", "max": "18412354", "mean": "88264", "stdev": "648272" }, { "name": "ag_004", "index": "4", "type": "numeric", "distinct": "1525", "missing": "19", "min": "0", "max": "43533168", "mean": "450015", "stdev": "2270310" }, { "name": "ag_005", "index": "5", "type": "numeric", "distinct": "1886", "missing": "19", "min": "0", "max": "38614924", "mean": "1118031", "stdev": "3144589" }, { "name": "ag_006", "index": "6", "type": "numeric", "distinct": "1878", "missing": "19", "min": "0", "max": "51735544", "mean": "1626876", "stdev": "3707022" }, { "name": "ag_007", "index": "7", "type": "numeric", "distinct": "1389", "missing": "19", "min": "0", "max": "28388532", "mean": "495118", "stdev": "1331318" }, { "name": "ag_009", "index": "8", "type": "numeric", "distinct": "552", "missing": "19", "min": "0", "max": "25198514", "mean": "24241", "stdev": "634653" }, { "name": "ak_000", "index": "9", "type": "numeric", "distinct": "8", "missing": "145", "min": "0", "max": "1241484", "mean": "684", "stdev": "28827" }, { "name": "al_000", "index": "10", "type": "numeric", "distinct": "682", "missing": "24", "min": "0", "max": "19729034", "mean": "58144", "stdev": "542114" }, { "name": "am_0", "index": "11", "type": "numeric", "distinct": "690", "missing": "23", "min": "0", "max": "26447474", "mean": "91287", "stdev": "782909" }, { "name": "an_000", "index": "12", "type": "numeric", "distinct": "1956", "missing": "24", "min": "0", "max": "140861830", "mean": "3500603", "stdev": "8191407" }, { "name": "ap_000", "index": "13", "type": "numeric", "distinct": "1944", "missing": "24", "min": "0", "max": "50150838", "mean": "1070072", "stdev": "3368633" }, { "name": "at_000", "index": "14", "type": "numeric", "distinct": "206", "missing": "23", "min": "0", "max": "6178524", "mean": "5458", "stdev": "140515" }, { "name": "av_000", "index": "15", "type": "numeric", "distinct": "691", "missing": "87", "min": "0", "max": "107824", "mean": "981", "stdev": "3562" }, { "name": "ax_000", "index": "16", "type": "numeric", "distinct": "458", "missing": "87", "min": "0", "max": "17018", "mean": "332", "stdev": "934" }, { "name": "ay_001", "index": "17", "type": "numeric", "distinct": "42", "missing": "19", "min": "0", "max": "10852718", "mean": "9347", "stdev": "266564" }, { "name": "ay_002", "index": "18", "type": "numeric", "distinct": "44", "missing": "19", "min": "0", "max": "8377142", "mean": "6732", "stdev": "196882" }, { "name": "ay_003", "index": "19", "type": "numeric", "distinct": "46", "missing": "19", "min": "0", "max": "4662578", "mean": "5720", "stdev": "127043" }, { "name": "ay_004", "index": "20", "type": "numeric", "distinct": "77", "missing": "19", "min": "0", "max": "2217634", "mean": "4732", "stdev": "83732" }, { "name": "ay_006", "index": "21", "type": "numeric", "distinct": "1394", "missing": "19", "min": "0", "max": "75263630", "mean": "1056790", "stdev": "3129936" }, { "name": "ay_008", "index": "22", "type": "numeric", "distinct": "1761", "missing": "19", "min": "0", "max": "63611514", "mean": "1117423", "stdev": "4420808" }, { "name": "ay_009", "index": "23", "type": "numeric", "distinct": "24", "missing": "19", "min": "0", "max": "123752", "mean": "122", "stdev": "2947" }, { "name": "az_000", "index": "24", "type": "numeric", "distinct": "1404", "missing": "19", "min": "0", "max": "4472566", "mean": "9714", "stdev": "106376" }, { "name": "az_001", "index": "25", "type": "numeric", "distinct": "1060", "missing": "19", "min": "0", "max": "1573086", "mean": "5048", "stdev": "40756" }, { "name": "az_004", "index": "26", "type": "numeric", "distinct": "1767", "missing": "19", "min": "0", "max": "87839714", "mean": "1534425", "stdev": "4601563" }, { "name": "az_006", "index": "27", "type": "numeric", "distinct": "997", "missing": "19", "min": "0", "max": "12306666", "mean": "77404", "stdev": "513330" }, { "name": "az_008", "index": "28", "type": "numeric", "distinct": "75", "missing": "19", "min": "0", "max": "123522", "mean": "238", "stdev": "3543" }, { "name": "ba_002", "index": "29", "type": "numeric", "distinct": "1766", "missing": "18", "min": "0", "max": "14813740", "mean": "419830", "stdev": "1147384" }, { "name": "ba_003", "index": "30", "type": "numeric", "distinct": "1718", "missing": "18", "min": "0", "max": "10273126", "mean": "273833", "stdev": "712245" }, { "name": "ba_004", "index": "31", "type": "numeric", "distinct": "1649", "missing": "18", "min": "0", "max": "10060564", "mean": "203431", "stdev": "537280" }, { "name": "ba_005", "index": "32", "type": "numeric", "distinct": "1564", "missing": "18", "min": "0", "max": "7821608", "mean": "178211", "stdev": "455541" }, { "name": "ba_009", "index": "33", "type": "numeric", "distinct": "429", "missing": "18", "min": "0", "max": "6659902", "mean": "29477", "stdev": "232768" }, { "name": "bb_000", "index": "34", "type": "numeric", "distinct": "1957", "missing": "27", "min": "0", "max": "174389278", "mean": "4597889", "stdev": "11404393" }, { "name": "bc_000", "index": "35", "type": "numeric", "distinct": "395", "missing": "97", "min": "0", "max": "54178", "mean": "417", "stdev": "2100" }, { "name": "bg_000", "index": "36", "type": "numeric", "distinct": "1936", "missing": "24", "min": "0", "max": "74247318", "mean": "1828053", "stdev": "4334141" }, { "name": "bh_000", "index": "37", "type": "numeric", "distinct": "1675", "missing": "24", "min": "0", "max": "2585334", "mean": "61081", "stdev": "166112" }, { "name": "bi_000", "index": "38", "type": "numeric", "distinct": "1941", "missing": "22", "min": "0", "max": "22817098", "mean": "484893", "stdev": "1437278" }, { "name": "bj_000", "index": "39", "type": "numeric", "distinct": "1917", "missing": "22", "min": "0", "max": "38910080", "mean": "579711", "stdev": "2168794" }, { "name": "bm_000", "index": "40", "type": "numeric", "distinct": "526", "missing": "1321", "min": "0", "max": "1310700", "mean": "411390", "stdev": "412938" }, { "name": "bq_000", "index": "41", "type": "numeric", "distinct": "182", "missing": "1628", "min": "0", "max": "1310700", "mean": "614738", "stdev": "540652" }, { "name": "br_000", "index": "42", "type": "numeric", "distinct": "171", "missing": "1639", "min": "0", "max": "1310700", "mean": "626010", "stdev": "544531" }, { "name": "bt_000", "index": "43", "type": "numeric", "distinct": "1970", "missing": "9", "min": "0", "max": "1371632", "mean": "57472", "stdev": "132231" }, { "name": "bv_000", "index": "44", "type": "numeric", "distinct": "1953", "missing": "31", "min": "0", "max": "174389278", "mean": "4590060", "stdev": "11412497" }, { "name": "bx_000", "index": "45", "type": "numeric", "distinct": "1900", "missing": "95", "min": "182", "max": "126424790", "mean": "4064716", "stdev": "10072461" }, { "name": "by_000", "index": "46", "type": "numeric", "distinct": "1525", "missing": "11", "min": "0", "max": "679749", "mean": "21432", "stdev": "49701" }, { "name": "bz_000", "index": "47", "type": "numeric", "distinct": "1158", "missing": "97", "min": "0", "max": "7389400", "mean": "85982", "stdev": "429672" }, { "name": "cc_000", "index": "48", "type": "numeric", "distinct": "1843", "missing": "95", "min": "0", "max": "125466978", "mean": "3759942", "stdev": "9333787" }, { "name": "cd_000", "index": "49", "type": "numeric", "distinct": "1", "missing": "27", "min": "1209600", "max": "1209600", "mean": "1209600", "stdev": "0" }, { "name": "cf_000", "index": "50", "type": "numeric", "distinct": "52", "missing": "471", "min": "0", "max": "28246", "mean": "44", "stdev": "843" }, { "name": "cg_000", "index": "51", "type": "numeric", "distinct": "180", "missing": "471", "min": "0", "max": "14142", "mean": "89", "stdev": "401" }, { "name": "ch_000", "index": "52", "type": "numeric", "distinct": "1", "missing": "471", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "ci_000", "index": "53", "type": "numeric", "distinct": "1960", "missing": "11", "min": "0", "max": "95932293", "mean": "3434949", "stdev": "8009346" }, { "name": "ck_000", "index": "54", "type": "numeric", "distinct": "1959", "missing": "11", "min": "0", "max": "47910094", "mean": "826958", "stdev": "2926415" }, { "name": "cl_000", "index": "55", "type": "numeric", "distinct": "129", "missing": "318", "min": "0", "max": "123904", "mean": "280", "stdev": "4528" }, { "name": "cn_001", "index": "56", "type": "numeric", "distinct": "305", "missing": "18", "min": "0", "max": "3265886", "mean": "19219", "stdev": "149556" }, { "name": "cn_003", "index": "57", "type": "numeric", "distinct": "1838", "missing": "18", "min": "0", "max": "43543382", "mean": "556728", "stdev": "2309789" }, { "name": "cn_005", "index": "58", "type": "numeric", "distinct": "1847", "missing": "18", "min": "0", "max": "43634004", "mean": "1338556", "stdev": "3086612" }, { "name": "cn_008", "index": "59", "type": "numeric", "distinct": "1134", "missing": "18", "min": "0", "max": "6888234", "mean": "16489", "stdev": "199506" }, { "name": "cn_009", "index": "60", "type": "numeric", "distinct": "496", "missing": "18", "min": "0", "max": "4256010", "mean": "4098", "stdev": "99167" }, { "name": "co_000", "index": "61", "type": "numeric", "distinct": "320", "missing": "471", "min": "0", "max": "31170", "mean": "346", "stdev": "1697" }, { "name": "cp_000", "index": "62", "type": "numeric", "distinct": "367", "missing": "97", "min": "0", "max": "64700", "mean": "455", "stdev": "2759" }, { "name": "cr_000", "index": "63", "type": "numeric", "distinct": "4", "missing": "1565", "min": "0", "max": "9370", "mean": "41", "stdev": "586" }, { "name": "cs_000", "index": "64", "type": "numeric", "distinct": "1492", "missing": "18", "min": "0", "max": "153032", "mean": "5511", "stdev": "9995" }, { "name": "cs_002", "index": "65", "type": "numeric", "distinct": "1513", "missing": "18", "min": "0", "max": "23442916", "mean": "247090", "stdev": "1194896" }, { "name": "cs_003", "index": "66", "type": "numeric", "distinct": "1752", "missing": "18", "min": "0", "max": "23693574", "mean": "388733", "stdev": "1266581" }, { "name": "cs_004", "index": "67", "type": "numeric", "distinct": "1815", "missing": "18", "min": "0", "max": "28238966", "mean": "441344", "stdev": "1798029" }, { "name": "cs_005", "index": "68", "type": "numeric", "distinct": "1906", "missing": "18", "min": "0", "max": "86445752", "mean": "2194081", "stdev": "5233471" }, { "name": "cs_006", "index": "69", "type": "numeric", "distinct": "1903", "missing": "18", "min": "0", "max": "32527058", "mean": "552884", "stdev": "1307718" }, { "name": "cs_009", "index": "70", "type": "numeric", "distinct": "11", "missing": "18", "min": "0", "max": "82200", "mean": "42", "stdev": "1846" }, { "name": "cu_000", "index": "71", "type": "numeric", "distinct": "685", "missing": "431", "min": "0", "max": "41736", "mean": "930", "stdev": "2344" }, { "name": "cv_000", "index": "72", "type": "numeric", "distinct": "1500", "missing": "431", "min": "0", "max": "48798824", "mean": "1882996", "stdev": "3525098" }, { "name": "cx_000", "index": "73", "type": "numeric", "distinct": "1289", "missing": "431", "min": "0", "max": "14029430", "mean": "332727", "stdev": "1166134" }, { "name": "cz_000", "index": "74", "type": "numeric", "distinct": "796", "missing": "431", "min": "0", "max": "1067354", "mean": "15489", "stdev": "55993" }, { "name": "db_000", "index": "75", "type": "numeric", "distinct": "49", "missing": "431", "min": "0", "max": "654", "mean": "12", "stdev": "28" }, { "name": "dc_000", "index": "76", "type": "numeric", "distinct": "1495", "missing": "431", "min": "0", "max": "51860276", "mean": "2135623", "stdev": "3794441" }, { "name": "df_000", "index": "77", "type": "numeric", "distinct": "31", "missing": "133", "min": "0", "max": "14422520", "mean": "11666", "stdev": "352871" }, { "name": "dg_000", "index": "78", "type": "numeric", "distinct": "60", "missing": "133", "min": "0", "max": "14904042", "mean": "18279", "stdev": "441996" }, { "name": "di_000", "index": "79", "type": "numeric", "distinct": "299", "missing": "133", "min": "0", "max": "5171790", "mean": "22679", "stdev": "222556" }, { "name": "dj_000", "index": "80", "type": "numeric", "distinct": "10", "missing": "133", "min": "0", "max": "8090", "mean": "6", "stdev": "192" }, { "name": "dk_000", "index": "81", "type": "numeric", "distinct": "18", "missing": "133", "min": "0", "max": "591376", "mean": "774", "stdev": "16513" }, { "name": "dm_000", "index": "82", "type": "numeric", "distinct": "8", "missing": "133", "min": "0", "max": "6131532", "mean": "4343", "stdev": "148294" }, { "name": "dq_000", "index": "83", "type": "numeric", "distinct": "423", "missing": "97", "min": "0", "max": "2147483647", "mean": "3114498", "stdev": "90431906" }, { "name": "ds_000", "index": "84", "type": "numeric", "distinct": "1564", "missing": "97", "min": "0", "max": "3856964", "mean": "84547", "stdev": "197795" }, { "name": "dt_000", "index": "85", "type": "numeric", "distinct": "1332", "missing": "97", "min": "0", "max": "586390", "mean": "15088", "stdev": "36131" }, { "name": "dv_000", "index": "86", "type": "numeric", "distinct": "1607", "missing": "97", "min": "0", "max": "24836024", "mean": "589131", "stdev": "1683239" }, { "name": "dx_000", "index": "87", "type": "numeric", "distinct": "585", "missing": "97", "min": "0", "max": "79342208", "mean": "764403", "stdev": "4059171" }, { "name": "dy_000", "index": "88", "type": "numeric", "distinct": "398", "missing": "97", "min": "0", "max": "923120", "mean": "4806", "stdev": "31491" }, { "name": "ea_000", "index": "89", "type": "numeric", "distinct": "18", "missing": "97", "min": "0", "max": "1606", "mean": "2", "stdev": "42" }, { "name": "eb_000", "index": "90", "type": "numeric", "distinct": "1146", "missing": "133", "min": "0", "max": "980083460", "mean": "10086096", "stdev": "48209127" }, { "name": "ec_00", "index": "91", "type": "numeric", "distinct": "1553", "missing": "333", "min": "0", "max": "50056", "mean": "1309", "stdev": "3286" }, { "name": "ed_000", "index": "92", "type": "numeric", "distinct": "818", "missing": "318", "min": "0", "max": "55348", "mean": "1415", "stdev": "3337" }, { "name": "ee_000", "index": "93", "type": "numeric", "distinct": "1907", "missing": "19", "min": "0", "max": "48080352", "mean": "814454", "stdev": "2872745" }, { "name": "ee_001", "index": "94", "type": "numeric", "distinct": "1893", "missing": "19", "min": "0", "max": "46498856", "mean": "807987", "stdev": "2569462" }, { "name": "ee_003", "index": "95", "type": "numeric", "distinct": "1651", "missing": "19", "min": "0", "max": "6283128", "mean": "200561", "stdev": "451015" }, { "name": "ee_007", "index": "96", "type": "numeric", "distinct": "1500", "missing": "19", "min": "0", "max": "21911570", "mean": "341588", "stdev": "1514485" }, { "name": "ee_008", "index": "97", "type": "numeric", "distinct": "1241", "missing": "19", "min": "0", "max": "9989570", "mean": "134825", "stdev": "445512" }, { "name": "ee_009", "index": "98", "type": "numeric", "distinct": "720", "missing": "19", "min": "0", "max": "1695086", "mean": "9158", "stdev": "53362" }, { "name": "ef_000", "index": "99", "type": "numeric", "distinct": "4", "missing": "97", "min": "0", "max": "8", "mean": "0", "stdev": "0" } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }