{ "data_id": "43483", "name": "Pima-Indians-Diabetes-Dataset", "exact_name": "Pima-Indians-Diabetes-Dataset", "version": 1, "version_label": "v1.0", "description": "Context\nThe unprocessed dataset was acquired from UCI Machine Learning organisation. This dataset is preprocessed by me, originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to accurately predict whether or not, a patient has diabetes, based on multiple features included in the dataset. I've achieved an accuracy metric score of 92.86 with Random Forest Classifier using this dataset. I've even developed a web-service Diabetes Prediction System using that trained model. You can explore the Exploratory Data Analysis notebook to better understand the data.\n\nAttributes Normal Value Range\n\nGlucose: Glucose (< 140) = Normal, Glucose (140-200) = Pre-Diabetic, Glucose (> 200) = Diabetic\nBloodPressure: B.P (< 60) = Below Normal, B.P (60-80) = Normal, B.P (80-90) = Stage 1 Hypertension, B.P (90-120) = Stage 2 Hypertension, B.P (> 120) = Hypertensive Crisis\nSkinThickness: SkinThickness (< 10) = Below Normal, SkinThickness (10-30) = Normal, SkinThickness (> 30) = Above Normal\nInsulin: Insulin (< 200) = Normal, Insulin (> 200) = Above Normal\nBMI: BMI (< 18.5) = Underweight, BMI (18.5-25) = Normal, BMI (25-30) = Overweight, BMI (> 30) = Obese\n\nAcknowledgements\nJ. W. Smith, J. E. Everhart, W. C. Dickson, W. C. Knowler and R. S. Johannes, \"Using the ADAP Learning Algorithm to Forecast the Onset of Diabetes Mellitus\" in Proc. of the Symposium on Computer Applications and Medical Care, pp. 261-265. IEEE Computer Society Press. 1988.\n\nInspiration\nMultiple models were trained on the original dataset but only Random Forest Classifier was able to score an accuracy metric of 78.57 but with this new preprocessed dataset an accuracy metric score of 92.86 was achieved. 
Can you build a machine learning model that can accurately predict whether a patient has diabetes or not? and can you achieve an accuracy metric score even higher than 92.86 without overfitting the model?", "format": "arff", "uploader": "Onur Yildirim", "uploader_id": 30126, "visibility": "public", "creator": null, "contributor": null, "date": "2022-03-23 13:26:55", "update_comment": null, "last_update": "2022-03-23 13:26:55", "licence": "CC0: Public Domain", "status": "active", "error_message": null, "url": "https:\/\/www.openml.org\/data\/download\/22102308\/dataset", "kaggle_url": null, "default_target_attribute": "Outcome", "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "Pima-Indians-Diabetes-Dataset", "Context The unprocessed dataset was acquired from UCI Machine Learning organisation. This dataset is preprocessed by me, originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to accurately predict whether or not, a patient has diabetes, based on multiple features included in the dataset. I've achieved an accuracy metric score of 92.86 with Random Forest Classifier using this dataset. 
I've even developed a web-service Diabetes Predic " ], "weight": 5 }, "qualities": { "NumberOfInstances": 768, "NumberOfFeatures": 9, "NumberOfClasses": 0, "NumberOfMissingValues": 0, "NumberOfInstancesWithMissingValues": 0, "NumberOfNumericFeatures": 9, "NumberOfSymbolicFeatures": 0, "PercentageOfBinaryFeatures": 0, "PercentageOfInstancesWithMissingValues": 0, "PercentageOfMissingValues": 0, "AutoCorrelation": 0.5501955671447197, "PercentageOfNumericFeatures": 100, "Dimensionality": 0.01171875, "PercentageOfSymbolicFeatures": 0, "MajorityClassPercentage": null, "MajorityClassSize": null, "MinorityClassPercentage": null, "MinorityClassSize": null, "NumberOfBinaryFeatures": 0 }, "tags": [], "features": [ { "name": "Outcome", "index": "8", "type": "numeric", "distinct": "2", "missing": "0", "target": "1", "min": "0", "max": "1", "mean": "0", "stdev": "0" }, { "name": "Pregnancies", "index": "0", "type": "numeric", "distinct": "17", "missing": "0", "min": "0", "max": "17", "mean": "4", "stdev": "3" }, { "name": "Glucose", "index": "1", "type": "numeric", "distinct": "135", "missing": "0", "min": "44", "max": "199", "mean": "122", "stdev": "30" }, { "name": "BloodPressure", "index": "2", "type": "numeric", "distinct": "47", "missing": "0", "min": "24", "max": "122", "mean": "72", "stdev": "12" }, { "name": "SkinThickness", "index": "3", "type": "numeric", "distinct": "50", "missing": "0", "min": "7", "max": "99", "mean": "29", "stdev": "9" }, { "name": "Insulin", "index": "4", "type": "numeric", "distinct": "187", "missing": "0", "min": "14", "max": "846", "mean": "142", "stdev": "89" }, { "name": "BMI", "index": "5", "type": "numeric", "distinct": "247", "missing": "0", "min": "18", "max": "67", "mean": "32", "stdev": "7" }, { "name": "DiabetesPedigreeFunction", "index": "6", "type": "numeric", "distinct": "517", "missing": "0", "min": "0", "max": "2", "mean": "0", "stdev": "0" }, { "name": "Age", "index": "7", "type": "numeric", "distinct": "52", "missing": "0", 
"min": "21", "max": "81", "mean": "33", "stdev": "12" } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }