{ "data_id": "43794", "name": "Tweets-with-keyword-lockdown-in-April-July-2020", "exact_name": "Tweets-with-keyword-lockdown-in-April-July-2020", "version": 1, "version_label": "v1.0", "description": "Context\nThis data was collected to be used with an academic project of mine. The project was about sentiment analysis of tweets during lockdown.\nContent\nI used the GetOldTweets3 (https:\/\/pypi.org\/project\/GetOldTweets3\/) python3 library to pull the tweets off Twitter. The tweets range between 1 April 2020 to 1 August 2020, which was the peak lockdown period in India. Tweets with duplicate text and NaN values were removed, and that was the only cleaning I did on the data.\nTotal rows of tweets: 95488\nColumns:\n\nIndex (be sure to use df = pandas.read_csv(\"tweets_lockdown.csv\", index_col=0))\nText - The text of the tweet\nDate - Date and time of tweet in datetime format\nRetweets - Number of retweets for the tweet\nFavorites - Favorites on the tweet\nMentions - Usernames mentioned in the tweets in format\nHashTags - Hashtags present in the tweet in format\n\n\"Top Tweets\" attribute was turned off while scraping.\nInspiration\nTwitter data gives us a lot of scope for data cleaning, text preprocessing, association rule mining, sentiment analysis and so on.", "format": "arff", "uploader": "Elif Ceren Gok", "uploader_id": 30125, "visibility": "public", "creator": null, "contributor": null, "date": "2022-03-24 10:33:50", "update_comment": null, "last_update": "2022-03-24 10:33:50", "licence": "CC0: Public Domain", "status": "active", "error_message": null, "url": "https:\/\/www.openml.org\/data\/download\/22102619\/dataset", "default_target_attribute": null, "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "Tweets-with-keyword-lockdown-in-April-July-2020", "Context This data was collected to be used with an academic project of mine. The project was about sentiment analysis of tweets during lockdown. Content I used the GetOldTweets3 (https:\/\/pypi.org\/project\/GetOldTweets3\/) python3 library to pull the tweets off Twitter. The tweets range between 1 April 2020 to 1 August 2020, which was the peak lockdown period in India. Tweets with duplicate text and NaN values were removed, and that was the only cleaning I did on the data. Total rows of tweets: 95488 Columns: In " ], "weight": 5 }, "qualities": { "NumberOfInstances": 95488, "NumberOfFeatures": 7, "NumberOfClasses": null, "NumberOfMissingValues": 160244, "NumberOfInstancesWithMissingValues": 90899, "NumberOfNumericFeatures": 3, "NumberOfSymbolicFeatures": 0, "Dimensionality": 7.330764075067024e-5, "PercentageOfNumericFeatures": 42.857142857142854, "MajorityClassPercentage": null, "PercentageOfSymbolicFeatures": 0, "MajorityClassSize": null, "MinorityClassPercentage": null, "MinorityClassSize": null, "NumberOfBinaryFeatures": 0, "PercentageOfBinaryFeatures": 0, "PercentageOfInstancesWithMissingValues": 95.19416052278821, "AutoCorrelation": null, "PercentageOfMissingValues": 23.97369302949062 }, "tags": [ { "uploader": "38960", "tag": "Life Science" }, { "uploader": "38960", "tag": "Medicine" } ], "features": [ { "name": "Unnamed:_0", "index": "0", "type": "numeric", "distinct": "95488", "missing": "0", "min": "0", "max": "95487", "mean": "47744", "stdev": "27565" }, { "name": "Text", "index": "1", "type": "string", "distinct": "95344", "missing": "19" }, { "name": "Date", "index": "2", "type": "string", "distinct": "58281", "missing": "0" }, { "name": "Retweets", "index": "3", "type": "numeric", "distinct": "321", "missing": "0", "min": "0", "max": "4680", "mean": "2", "stdev": "39" }, { "name": "Favorites", "index": "4", "type": "numeric", "distinct": "607", "missing": "0", "min": "0", "max": "23953", "mean": "10", "stdev": "193" }, { "name": "Mentions", "index": "5", "type": "string", "distinct": "8729", "missing": "82588" }, { "name": "HashTags", "index": "6", "type": "string", "distinct": "12795", "missing": "77637"
} ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }