{ "data_id": "43705", "name": "California-Housing-Prices", "exact_name": "California-Housing-Prices", "version": 1, "version_label": "v1.0", "description": "Context\nThis is the dataset used in the second chapter of Aur\u00e9lien G\u00e9ron's recent book 'Hands-On Machine Learning with Scikit-Learn and TensorFlow'. It serves as an excellent introduction to implementing machine learning algorithms because it requires rudimentary data cleaning, has an easily understandable list of variables and sits at an optimal size between being too toyish and too cumbersome.\nThe data contains information from the 1990 California census. So although it may not help you with predicting current housing prices like the Zillow Zestimate dataset, it does provide an accessible introductory dataset for teaching people about the basics of machine learning.\nContent\nThe data pertains to the houses found in a given California district and some summary stats about them based on the 1990 census data. Be warned the data aren't cleaned so there are some preprocessing steps required! The columns are as follows; their names are pretty self-explanatory:\nlongitude\nlatitude\nhousing_median_age\ntotal_rooms\ntotal_bedrooms\npopulation\nhouseholds\nmedian_income\nmedian_house_value\nocean_proximity\nAcknowledgements\nThis data was initially featured in the following paper:\nPace, R. Kelley, and Ronald Barry. 
\"Sparse spatial autoregressions.\" Statistics & Probability Letters 33.3 (1997): 291-297.\nand I encountered it in 'Hands-On Machine Learning with Scikit-Learn and TensorFlow' by Aur\u00e9lien G\u00e9ron.\nAur\u00e9lien G\u00e9ron wrote:\nThis dataset is a modified version of the California Housing dataset available from:\nLu\u00eds Torgo's page (University of Porto)\nInspiration\nSee my kernel on machine learning basics in R using this dataset, or venture over to the following link for a Python-based introductory tutorial: https:\/\/github.com\/ageron\/handson-ml\/tree\/master\/datasets\/housing", "format": "arff", "uploader": "Dustin Carrion", "uploader_id": 30123, "visibility": "public", "creator": null, "contributor": null, "date": "2022-03-24 07:16:49", "update_comment": null, "last_update": "2022-03-24 07:16:49", "licence": "CC0: Public Domain", "status": "active", "error_message": null, "url": "https:\/\/www.openml.org\/data\/download\/22102530\/dataset", "default_target_attribute": null, "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "California-Housing-Prices", "Context This is the dataset used in the second chapter of Aurlien Gron's recent book 'Hands-On Machine learning with Scikit-Learn and TensorFlow'. It serves as an excellent introduction to implementing machine learning algorithms because it requires rudimentary data cleaning, has an easily understandable list of variables and sits at an optimal size between being to toyish and too cumbersome. The data contains information from the 1990 California census. 
So although it may not help you with pred " ], "weight": 5 }, "qualities": { "NumberOfInstances": 20640, "NumberOfFeatures": 10, "NumberOfClasses": null, "NumberOfMissingValues": 207, "NumberOfInstancesWithMissingValues": 207, "NumberOfNumericFeatures": 9, "NumberOfSymbolicFeatures": 0, "Dimensionality": 0.00048449612403100775, "PercentageOfNumericFeatures": 90, "MajorityClassPercentage": null, "PercentageOfSymbolicFeatures": 0, "MajorityClassSize": null, "MinorityClassPercentage": null, "MinorityClassSize": null, "NumberOfBinaryFeatures": 0, "PercentageOfBinaryFeatures": 0, "PercentageOfInstancesWithMissingValues": 1.002906976744186, "AutoCorrelation": null, "PercentageOfMissingValues": 0.1002906976744186 }, "tags": [ { "uploader": "38960", "tag": "Demographics" }, { "uploader": "38960", "tag": "Sociology" } ], "features": [ { "name": "longitude", "index": "0", "type": "numeric", "distinct": "844", "missing": "0", "min": "-124", "max": "0", "mean": "-120", "stdev": "2" }, { "name": "latitude", "index": "1", "type": "numeric", "distinct": "862", "missing": "0", "min": "33", "max": "42", "mean": "36", "stdev": "2" }, { "name": "housing_median_age", "index": "2", "type": "numeric", "distinct": "52", "missing": "0", "min": "1", "max": "52", "mean": "29", "stdev": "13" }, { "name": "total_rooms", "index": "3", "type": "numeric", "distinct": "5926", "missing": "0", "min": "2", "max": "39320", "mean": "2636", "stdev": "2182" }, { "name": "total_bedrooms", "index": "4", "type": "numeric", "distinct": "1923", "missing": "207", "min": "1", "max": "6445", "mean": "538", "stdev": "421" }, { "name": "population", "index": "5", "type": "numeric", "distinct": "3888", "missing": "0", "min": "3", "max": "35682", "mean": "1425", "stdev": "1132" }, { "name": "households", "index": "6", "type": "numeric", "distinct": "1815", "missing": "0", "min": "1", "max": "6082", "mean": "500", "stdev": "382" }, { "name": "median_income", "index": "7", "type": "numeric", "distinct": "12928", 
"missing": "0", "min": "0", "max": "15", "mean": "4", "stdev": "2" }, { "name": "median_house_value", "index": "8", "type": "numeric", "distinct": "3842", "missing": "0", "min": "14999", "max": "500001", "mean": "206856", "stdev": "115396" }, { "name": "ocean_proximity", "index": "9", "type": "string", "distinct": "5", "missing": "0" } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }