{ "data_id": "43504", "name": "1-million-Reddit-comments-from-40-subreddits", "exact_name": "1-million-Reddit-comments-from-40-subreddits", "version": 1, "version_label": "v1.0", "description": "Content\nThis data is an extract from a bigger reddit dataset (All reddit comments from May 2019, 157GB of data uncompressed) that contains both more comments and more associated information (timestamps, author, flairs etc).\nFor ease of use, I picked the first 25 000 comments for each of the 40 most frequented subreddits (May 2019), this way, if anyone wants to use the subreddit as categorical data, the volumes are balanced.\nI also excluded any removed comments \/ comments whose author got deleted and comments deemed too short (less than 4 tokens) and changed the format (json - csv).\nThis is primarily an NLP dataset, but in addition to the comments I added the 3 features I deemed the most important, I also aimed for feature type variety.\nThe information kept here is:\n\nsubreddit (categorical): on which subreddit the comment was posted\nbody (str): comment content\ncontroversiality (binary): a reddit aggregated metric\nscore (scalar): upvotes minus downvotes\n\nAcknowledgements\nThe data is but a small extract of what is being collected by pushshift.io on a monthly basis. You can easily find the full information if you want to work with more features and more data.\nWhat can I do with that?\nHave fun! The variety of feature types should allow you to gain a few interesting insights or build some simple models.\nNote\nIf you think the License (CC0: Public Domain) should be different, contact me", "format": "arff", "uploader": "Onur Yildirim", "uploader_id": 30126, "visibility": "public", "creator": null, "contributor": null, "date": "2022-03-23 13:31:10", "update_comment": null, "last_update": "2022-03-23 13:31:10", "licence": "CC0: Public Domain", "status": "active", "error_message": null, "url": "https:\/\/www.openml.org\/data\/download\/22102329\/dataset", "default_target_attribute": null, "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "1-million-Reddit-comments-from-40-subreddits", "Content This data is an extract from a bigger reddit dataset (All reddit comments from May 2019, 157GB of data uncompressed) that contains both more comments and more associated information (timestamps, author, flairs etc). For ease of use, I picked the first 25 000 comments for each of the 40 most frequented subreddits (May 2019), this way, if anyone wants to use the subreddit as categorical data, the volumes are balanced. I also excluded any removed comments \/ comments whose author got deleted " ], "weight": 5 }, "qualities": { "NumberOfInstances": 1000000, "NumberOfFeatures": 4, "NumberOfClasses": null, "NumberOfMissingValues": 1, "NumberOfInstancesWithMissingValues": 1, "NumberOfNumericFeatures": 2, "NumberOfSymbolicFeatures": 0, "Dimensionality": 4.0e-6, "PercentageOfNumericFeatures": 50, "MajorityClassPercentage": null, "PercentageOfSymbolicFeatures": 0, "MajorityClassSize": null, "MinorityClassPercentage": null, "MinorityClassSize": null, "NumberOfBinaryFeatures": 0, "PercentageOfBinaryFeatures": 0, "PercentageOfInstancesWithMissingValues": 9.999999999999999e-5, "AutoCorrelation": null, "PercentageOfMissingValues": 2.4999999999999998e-5 }, "tags": [ { "uploader": "38960", "tag": "Computer Systems" }, { "uploader": "38960", "tag": "Machine Learning" } ], "features": [ { "name": "subreddit", "index": "0", "type": "string", "distinct": "40", "missing": "0" }, { "name": "body", "index": "1", "type": "string", "distinct": "963903", "missing": "1" }, { "name": "controversiality", "index": "2", "type": "numeric", "distinct": "2", "missing": "0", "min": "0", "max": "1", "mean": "0", "stdev": "0" }, { "name": "score", "index": "3", "type": "numeric", "distinct": "2110", "missing": "0", "min": "-889", "max": "35619", "mean": "12", "stdev": "150" } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }