{ "data_id": "45067", "name": "okcupid_stem", "exact_name": "okcupid_stem", "version": 1, "version_label": "1", "description": "User profile data for San Francisco OkCupid users published in [Kim, A. Y., & Escobedo-Land, A. (2015). OKCupid data for introductory statistics and data science courses. Journal of Statistics Education, 23(2).]. The curated dataset was downloaded from [https:\/\/github.com\/rudeboybert\/JSE_OkCupid]. The original dataset was created with the use of a python script that pulled the data from public profiles on www.okcupid.com on 06\/30\/2012. It includes people (n = 59946) within a 25 mile radius of San Francisco, who were online in the last year (06\/30\/2011), with at least one profile picture. Permission to use this data was obtained by the author of the original paper from OkCupid president and co-founder Christian Rudder under the condition that the dataset remains public. As target, the variable 'job' was collapsed into three categories: 'stem', 'non_stem', and 'student'. STEM jobs were defined as 'job' %in% c('computer \/ hardware \/ software', 'science \/ tech \/ engineering'). Observations with 'job' %in% c('unemployed', 'retired', 'rather not say') or missing values in 'job' were removed. The factor labels of the variable 'speaks' had to be changed to integers to prevent a bug which would not allow the upload of the variable as a nominal feature. The original dataset also included ten open text variables 'essay0' to 'essay9', which were removed from the dataset uploaded here. The dataset further includes the date\/time variable 'last_online' (ignored by default) which could be used to construct additional features. 
Using OkCupid data for predicting STEM jobs was inspired by Max Kuhn's book 'Feature Engineering and Selection: A Practical Approach for Predictive Models' [https:\/\/bookdown.org\/max\/FES\/].", "format": "arff", "uploader": "Young Lee", "uploader_id": 31892, "visibility": "public", "creator": "\"Yoontae Hwang, Youngbin Lee, Yongjae Lee\"", "contributor": null, "date": "2023-01-27 11:22:54", "update_comment": null, "last_update": "2023-01-27 11:22:54", "licence": "Public", "status": "active", "error_message": null, "url": "https:\/\/api.openml.org\/data\/download\/22112025\/dataset", "default_target_attribute": "class", "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "okcupid_stem", "User profile data for San Francisco OkCupid users published in [Kim, A. Y., & Escobedo-Land, A. (2015). OKCupid data for introductory statistics and data science courses. Journal of Statistics Education, 23(2).]. The curated dataset was downloaded from [https:\/\/github.com\/rudeboybert\/JSE_OkCupid]. The original dataset was created with the use of a python script that pulled the data from public profiles on www.okcupid.com on 06\/30\/2012. 
It includes people (n = 59946) within a 25 mile radius of Sa " ], "weight": 5 }, "qualities": { "NumberOfInstances": 26677, "NumberOfFeatures": 14, "NumberOfClasses": 3, "NumberOfMissingValues": 0, "NumberOfInstancesWithMissingValues": 0, "NumberOfNumericFeatures": 2, "NumberOfSymbolicFeatures": 11, "MajorityClassSize": 19209, "MinorityClassPercentage": 10.548412490160063, "MinorityClassSize": 2814, "NumberOfBinaryFeatures": 1, "PercentageOfBinaryFeatures": 7.142857142857142, "PercentageOfInstancesWithMissingValues": 0, "AutoCorrelation": 1, "PercentageOfMissingValues": 0, "Dimensionality": 0.0005247966413014956, "PercentageOfNumericFeatures": 14.285714285714285, "MajorityClassPercentage": 72.00584773400307, "PercentageOfSymbolicFeatures": 78.57142857142857 }, "tags": [ { "uploader": "38960", "tag": "Life Science" }, { "uploader": "38960", "tag": "Machine Learning" } ], "features": [ { "name": "class", "index": "13", "type": "string", "distinct": "3", "missing": "0", "target": "1" }, { "name": "age", "index": "0", "type": "numeric", "distinct": "52", "missing": "0", "min": "18", "max": "69", "mean": "33", "stdev": "10" }, { "name": "height", "index": "1", "type": "numeric", "distinct": "44", "missing": "0", "min": "3", "max": "95", "mean": "68", "stdev": "4" }, { "name": "body_type", "index": "2", "type": "nominal", "distinct": "12", "missing": "0", "distr": [] }, { "name": "drinks", "index": "3", "type": "nominal", "distinct": "6", "missing": "0", "distr": [] }, { "name": "drugs", "index": "4", "type": "nominal", "distinct": "3", "missing": "0", "distr": [] }, { "name": "education", "index": "5", "type": "nominal", "distinct": "32", "missing": "0", "distr": [] }, { "name": "ethnicity", "index": "6", "type": "nominal", "distinct": "179", "missing": "0", "distr": [] }, { "name": "location", "index": "7", "type": "nominal", "distinct": "145", "missing": "0", "distr": [] }, { "name": "orientation", "index": "8", "type": "nominal", "distinct": "3", "missing": "0", 
"distr": [] }, { "name": "sex", "index": "9", "type": "nominal", "distinct": "2", "missing": "0", "distr": [] }, { "name": "sign", "index": "10", "type": "nominal", "distinct": "48", "missing": "0", "distr": [] }, { "name": "smokes", "index": "11", "type": "nominal", "distinct": "5", "missing": "0", "distr": [] }, { "name": "status", "index": "12", "type": "nominal", "distinct": "5", "missing": "0", "distr": [] } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }