{ "data_id": "43712", "name": "515K-Hotel-Reviews-Data-in-Europe", "exact_name": "515K-Hotel-Reviews-Data-in-Europe", "version": 1, "version_label": "v1.0", "description": "Acknowledgements\nThe data was scraped from Booking.com. All data in the file is publicly available to everyone already. Data is originally owned by Booking.com. Please contact me through my profile if you want to use this dataset somewhere else.\nData Context\nThis dataset contains 515,000 customer reviews and scoring of 1493 luxury hotels across Europe. Meanwhile, the geographical location of hotels are also provided for further analysis.\nData Content\nThe csv file contains 17 fields. The description of each field is as below:\n\nHotel_Address: Address of hotel. \nReview_Date: Date when reviewer posted the corresponding review.\nAverage_Score: Average Score of the hotel, calculated based on the latest comment in the last year.\nHotel_Name: Name of Hotel\nReviewer_Nationality: Nationality of Reviewer\nNegative_Review: Negative Review the reviewer gave to the hotel. If the reviewer does not give the negative review, then it should be: 'No Negative'\nReview_Total_Negative_Word_Counts: Total number of words in the negative review.\nPositive_Review: Positive Review the reviewer gave to the hotel. If the reviewer does not give the positive review, then it should be: 'No Positive'\nReview_Total_Positive_Word_Counts: Total number of words in the positive review.\nReviewer_Score: Score the reviewer has given to the hotel, based on his\/her experience\nTotal_Number_of_Reviews_Reviewer_Has_Given: Number of Reviews the reviewer has given in the past.\nTotal_Number_of_Reviews: Total number of valid reviews the hotel has.\nTags: Tags reviewer gave the hotel.\ndays_since_review: Duration between the review date and scrape date.\nAdditional_Number_of_Scoring: There are also some guests who just made a scoring on the service rather than a review. 
This number indicates how many valid scores without a review there are.\nlat: Latitude of the hotel\nlng: Longitude of the hotel\n\nIn order to keep the text data clean, I removed unicode and punctuation in the text data and transformed the text into lower case. No other preprocessing was performed.\nInspiration\nThe dataset is large and informative, I believe you can have a lot of fun with it! Let me put some ideas below to further inspire kagglers!\n\nFit a regression model on reviews and score to see which words are more indicative of a higher\/lower score\nPerform a sentiment analysis on the reviews\nFind correlation between reviewer's nationality and scores.\nBeautiful and informative visualization on the dataset.\nClustering hotels based on reviews\nSimple recommendation engine to the guest who is fond of a special characteristic of hotel.\n\nThe ideas are unlimited! Please, have a look at the data, generate some ideas and leave a master kernel here! I am ready to upvote your ideas and kernels! Cheers!", "format": "arff", "uploader": "Dustin Carrion", "uploader_id": 30123, "visibility": "public", "creator": null, "contributor": null, "date": "2022-03-24 07:31:03", "update_comment": null, "last_update": "2022-03-24 07:31:03", "licence": "CC0: Public Domain", "status": "active", "error_message": null, "url": "https:\/\/www.openml.org\/data\/download\/22102537\/dataset", "default_target_attribute": null, "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "515K-Hotel-Reviews-Data-in-Europe", "Acknowledgements The data was scraped from Booking.com. All data in the file is publicly available to everyone already. Data is originally owned by Booking.com. Please contact me through my profile if you want to use this dataset somewhere else. Data Context This dataset contains 515,000 customer reviews and scoring of 1493 luxury hotels across Europe. Meanwhile, the geographical location of hotels are also provided for further analysis. 
Data Content The csv file contains 17 fields. The descript " ], "weight": 5 }, "qualities": { "NumberOfInstances": 515738, "NumberOfFeatures": 17, "NumberOfClasses": null, "NumberOfMissingValues": 6536, "NumberOfInstancesWithMissingValues": 3268, "NumberOfNumericFeatures": 9, "NumberOfSymbolicFeatures": 0, "Dimensionality": 3.2962473193753414e-5, "PercentageOfNumericFeatures": 52.94117647058824, "MajorityClassPercentage": null, "PercentageOfSymbolicFeatures": 0, "MajorityClassSize": null, "MinorityClassPercentage": null, "MinorityClassSize": null, "NumberOfBinaryFeatures": 0, "PercentageOfBinaryFeatures": 0, "PercentageOfInstancesWithMissingValues": 0.6336550729246245, "AutoCorrelation": null, "PercentageOfMissingValues": 0.07454765563819112 }, "tags": [ { "uploader": "38960", "tag": "Transportation" } ], "features": [ { "name": "Hotel_Address", "index": "0", "type": "string", "distinct": "1493", "missing": "0" }, { "name": "Additional_Number_of_Scoring", "index": "1", "type": "numeric", "distinct": "480", "missing": "0", "min": "1", "max": "2682", "mean": "498", "stdev": "501" }, { "name": "Review_Date", "index": "2", "type": "string", "distinct": "731", "missing": "0" }, { "name": "Average_Score", "index": "3", "type": "numeric", "distinct": "34", "missing": "0", "min": "5", "max": "10", "mean": "8", "stdev": "1" }, { "name": "Hotel_Name", "index": "4", "type": "string", "distinct": "1492", "missing": "0" }, { "name": "Reviewer_Nationality", "index": "5", "type": "string", "distinct": "227", "missing": "0" }, { "name": "Negative_Review", "index": "6", "type": "string", "distinct": "330011", "missing": "0" }, { "name": "Review_Total_Negative_Word_Counts", "index": "7", "type": "numeric", "distinct": "402", "missing": "0", "min": "0", "max": "408", "mean": "19", "stdev": "30" }, { "name": "Total_Number_of_Reviews", "index": "8", "type": "numeric", "distinct": "1142", "missing": "0", "min": "43", "max": "16670", "mean": "2744", "stdev": "2317" }, { "name": 
"Positive_Review", "index": "9", "type": "string", "distinct": "412601", "missing": "0" }, { "name": "Review_Total_Positive_Word_Counts", "index": "10", "type": "numeric", "distinct": "365", "missing": "0", "min": "0", "max": "395", "mean": "18", "stdev": "22" }, { "name": "Total_Number_of_Reviews_Reviewer_Has_Given", "index": "11", "type": "numeric", "distinct": "198", "missing": "0", "min": "1", "max": "355", "mean": "7", "stdev": "11" }, { "name": "Reviewer_Score", "index": "12", "type": "numeric", "distinct": "37", "missing": "0", "min": "3", "max": "10", "mean": "8", "stdev": "2" }, { "name": "Tags", "index": "13", "type": "string", "distinct": "55242", "missing": "0" }, { "name": "days_since_review", "index": "14", "type": "string", "distinct": "731", "missing": "0" }, { "name": "lat", "index": "15", "type": "numeric", "distinct": "1472", "missing": "3268", "min": "41", "max": "52", "mean": "49", "stdev": "3" }, { "name": "lng", "index": "16", "type": "numeric", "distinct": "1472", "missing": "3268", "min": "0", "max": "16", "mean": "3", "stdev": "5" } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }