{ "data_id": "43586", "name": "U.S.-Pollution-Data", "exact_name": "U.S.-Pollution-Data", "version": 1, "version_label": "v1.0", "description": "Context\nThis dataset deals with pollution in the U.S. Pollution in the U.S. has been well documented by the U.S. EPA but it is a pain to download all the data and arrange them in a format that interests data scientists. Hence I gathered four major pollutants (Nitrogen Dioxide, Sulphur Dioxide, Carbon Monoxide and Ozone) for every day from 2000 - 2016 and placed them neatly in a CSV file. \nContent\nThere is a total of 28 fields. The four pollutants (NO2, O3, SO2 and CO) each has 5 specific columns. Observations totaled to over 1.4 million. This kernel provides a good introduction to this dataset!\nFor observations on specific columns visit the Column Metadata on the Data tab.\nAcknowledgements\nAll the data is scraped from the database of U.S. EPA : https:\/\/aqsdr1.epa.gov\/aqsweb\/aqstmp\/airdata\/download_files.html \nInspiration\nI did a related project with some of my friends in college, and decided to open source our dataset so that data scientists don't need to re-scrape the U.S. EPA site for historical pollution data.", "format": "arff", "uploader": "Dustin Carrion", "uploader_id": 30123, "visibility": "public", "creator": null, "contributor": null, "date": "2022-03-24 00:23:56", "update_comment": null, "last_update": "2022-03-24 00:23:56", "licence": "Database: Open Database, Contents: Database Contents", "status": "active", "error_message": null, "url": "https:\/\/www.openml.org\/data\/download\/22102411\/dataset", "default_target_attribute": null, "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "U.S.-Pollution-Data", "Context This dataset deals with pollution in the U.S. Pollution in the U.S. has been well documented by the U.S. EPA but it is a pain to download all the data and arrange them in a format that interests data scientists. 
Hence I gathered four major pollutants (Nitrogen Dioxide, Sulphur Dioxide, Carbon Monoxide and Ozone) for every day from 2000 - 2016 and placed them neatly in a CSV file. Content There is a total of 28 fields. The four pollutants (NO2, O3, SO2 and CO) each has 5 specific columns. " ], "weight": 5 }, "qualities": { "NumberOfInstances": 1746661, "NumberOfFeatures": 29, "NumberOfClasses": null, "NumberOfMissingValues": 1746230, "NumberOfInstancesWithMissingValues": 1309785, "NumberOfNumericFeatures": 20, "NumberOfSymbolicFeatures": 0, "Dimensionality": 1.660310730015727e-5, "PercentageOfNumericFeatures": 68.96551724137932, "MajorityClassPercentage": null, "PercentageOfSymbolicFeatures": 0, "MajorityClassSize": null, "MinorityClassPercentage": null, "MinorityClassSize": null, "NumberOfBinaryFeatures": 0, "PercentageOfBinaryFeatures": 0, "PercentageOfInstancesWithMissingValues": 74.98793412116032, "AutoCorrelation": null, "PercentageOfMissingValues": 3.447424977497459 }, "tags": [ { "uploader": "38960", "tag": "Computer Systems" }, { "uploader": "38960", "tag": "Machine Learning" } ], "features": [ { "name": "Unnamed:_0", "index": "0", "type": "numeric", "distinct": "134576", "missing": "0", "min": "0", "max": "134575", "mean": "54714", "stdev": "33729" }, { "name": "State_Code", "index": "1", "type": "numeric", "distinct": "47", "missing": "0", "min": "1", "max": "80", "mean": "22", "stdev": "17" }, { "name": "County_Code", "index": "2", "type": "numeric", "distinct": "73", "missing": "0", "min": "1", "max": "650", "mean": "72", "stdev": "79" }, { "name": "Site_Num", "index": "3", "type": "numeric", "distinct": "110", "missing": "0", "min": "1", "max": "9997", "mean": "1118", "stdev": "2003" }, { "name": "Address", "index": "4", "type": "string", "distinct": "204", "missing": "0" }, { "name": "State", "index": "5", "type": "string", "distinct": "47", "missing": "0" }, { "name": "County", "index": "6", "type": "string", "distinct": "133", "missing": "0" }, { "name": 
"City", "index": "7", "type": "string", "distinct": "144", "missing": "0" }, { "name": "Date_Local", "index": "8", "type": "string", "distinct": "5996", "missing": "0" }, { "name": "NO2_Units", "index": "9", "type": "string", "distinct": "1", "missing": "0" }, { "name": "NO2_Mean", "index": "10", "type": "numeric", "distinct": "31859", "missing": "0", "min": "-2", "max": "140", "mean": "13", "stdev": "10" }, { "name": "NO2_1st_Max_Value", "index": "11", "type": "numeric", "distinct": "990", "missing": "0", "min": "-2", "max": "267", "mean": "25", "stdev": "16" }, { "name": "NO2_1st_Max_Hour", "index": "12", "type": "numeric", "distinct": "24", "missing": "0", "min": "0", "max": "23", "mean": "12", "stdev": "8" }, { "name": "NO2_AQI", "index": "13", "type": "numeric", "distinct": "129", "missing": "0", "min": "0", "max": "132", "mean": "24", "stdev": "15" }, { "name": "O3_Units", "index": "14", "type": "string", "distinct": "1", "missing": "0" }, { "name": "O3_Mean", "index": "15", "type": "numeric", "distinct": "8196", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "O3_1st_Max_Value", "index": "16", "type": "numeric", "distinct": "134", "missing": "0", "min": "0", "max": "0", "mean": "0", "stdev": "0" }, { "name": "O3_1st_Max_Hour", "index": "17", "type": "numeric", "distinct": "24", "missing": "0", "min": "0", "max": "23", "mean": "10", "stdev": "4" }, { "name": "O3_AQI", "index": "18", "type": "numeric", "distinct": "125", "missing": "0", "min": "0", "max": "218", "mean": "36", "stdev": "20" }, { "name": "SO2_Units", "index": "19", "type": "string", "distinct": "1", "missing": "0" }, { "name": "SO2_Mean", "index": "20", "type": "numeric", "distinct": "12736", "missing": "0", "min": "-2", "max": "322", "mean": "2", "stdev": "3" }, { "name": "SO2_1st_Max_Value", "index": "21", "type": "numeric", "distinct": "921", "missing": "0", "min": "-2", "max": "351", "mean": "4", "stdev": "8" }, { "name": "SO2_1st_Max_Hour", "index": "22", 
"type": "numeric", "distinct": "24", "missing": "0", "min": "0", "max": "23", "mean": "10", "stdev": "7" }, { "name": "SO2_AQI", "index": "23", "type": "numeric", "distinct": "140", "missing": "872907", "min": "0", "max": "200", "mean": "7", "stdev": "12" }, { "name": "CO_Units", "index": "24", "type": "string", "distinct": "1", "missing": "0" }, { "name": "CO_Mean", "index": "25", "type": "numeric", "distinct": "34123", "missing": "0", "min": "0", "max": "8", "mean": "0", "stdev": "0" }, { "name": "CO_1st_Max_Value", "index": "26", "type": "numeric", "distinct": "2698", "missing": "0", "min": "0", "max": "20", "mean": "1", "stdev": "1" }, { "name": "CO_1st_Max_Hour", "index": "27", "type": "numeric", "distinct": "24", "missing": "0", "min": "0", "max": "23", "mean": "8", "stdev": "8" }, { "name": "CO_AQI", "index": "28", "type": "numeric", "distinct": "107", "missing": "873323", "min": "0", "max": "201", "mean": "6", "stdev": "6" } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }