{ "data_id": "43542", "name": "7k-Books", "exact_name": "7k-Books", "version": 1, "version_label": "v1.0", "description": "Do we really need another dataset of books?\nMy initial plan was to build a toy example for a recommender system article I was writing. After a bit of googling, I found a few datasets. Sadly, most of them had some issues that made them unusable for me (e.g, missing description of the book, a mix of different languages but no column to specify the language per row or weird delimiters). \nSo I decided to make a dataset that would match my purposes.\nFirst, I got ISBNs from Soumik's Goodreads-books dataset. Using those identifiers, I crawled the Google Books API to extract the books' information.\nThen, I merged those results with some of the original columns from the dataset and after some cleaning I got the dataset you see here.\nWhat can I do with this?\nDifferent Exploratory Data Analysis, clustering of books by topics\/category, content-based recommendation engine using different fields from the book's description. \nWhy is this dataset smaller than Soumik's Goodreads-books?\nMany of the ISBNs of that dataset did not return valid results from the Google Books API. 
I plan to update this in the future, using more fields (e.g., title, author) in the API requests, so as to have a bigger dataset.\nWhat did you use to build this dataset?\nCheck out the repository here Google Books Crawler\nAcknowledgements\nThis dataset relied heavily on Soumik's Goodreads-books dataset.", "format": "arff", "uploader": "Onur Yildirim", "uploader_id": 30126, "visibility": "public", "creator": null, "contributor": null, "date": "2022-03-23 13:46:08", "update_comment": null, "last_update": "2022-03-23 13:46:08", "licence": "CC0: Public Domain", "status": "active", "error_message": null, "url": "https:\/\/www.openml.org\/data\/download\/22102367\/dataset", "default_target_attribute": null, "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "7k-Books", "Do we really need another dataset of books? My initial plan was to build a toy example for a recommender system article I was writing. After a bit of googling, I found a few datasets. Sadly, most of them had some issues that made them unusable for me (e.g, missing description of the book, a mix of different languages but no column to specify the language per row or weird delimiters). So I decided to make a dataset that would match my purposes. 
First, I got ISBNs from Soumik's Goodreads-books dat " ], "weight": 5 }, "qualities": { "NumberOfInstances": 6810, "NumberOfFeatures": 12, "NumberOfClasses": null, "NumberOfMissingValues": 5336, "NumberOfInstancesWithMissingValues": 4630, "NumberOfNumericFeatures": 5, "NumberOfSymbolicFeatures": 0, "Dimensionality": 0.001762114537444934, "PercentageOfNumericFeatures": 41.66666666666667, "MajorityClassPercentage": null, "PercentageOfSymbolicFeatures": 0, "MajorityClassSize": null, "MinorityClassPercentage": null, "MinorityClassSize": null, "NumberOfBinaryFeatures": 0, "PercentageOfBinaryFeatures": 0, "PercentageOfInstancesWithMissingValues": 67.98825256975036, "AutoCorrelation": null, "PercentageOfMissingValues": 6.529613313754283 }, "tags": [ { "uploader": "38960", "tag": "Machine Learning" }, { "uploader": "38960", "tag": "Mathematics" } ], "features": [ { "name": "isbn13", "index": "0", "type": "numeric", "distinct": "6810", "missing": "0", "min": "2147483647", "max": "2147483647", "mean": "2147483647", "stdev": "606891057" }, { "name": "isbn10", "index": "1", "type": "string", "distinct": "6810", "missing": "0" }, { "name": "title", "index": "2", "type": "string", "distinct": "6394", "missing": "4" }, { "name": "subtitle", "index": "3", "type": "string", "distinct": "2009", "missing": "4429" }, { "name": "authors", "index": "4", "type": "string", "distinct": "3775", "missing": "77" }, { "name": "categories", "index": "5", "type": "string", "distinct": "567", "missing": "99" }, { "name": "thumbnail", "index": "6", "type": "string", "distinct": "6481", "missing": "329" }, { "name": "description", "index": "7", "type": "string", "distinct": "6473", "missing": "263" }, { "name": "published_year", "index": "8", "type": "numeric", "distinct": "94", "missing": "6", "min": "1853", "max": "2019", "mean": "1999", "stdev": "10" }, { "name": "average_rating", "index": "9", "type": "numeric", "distinct": "200", "missing": "43", "min": "0", "max": "5", "mean": "4", "stdev": 
"0" }, { "name": "num_pages", "index": "10", "type": "numeric", "distinct": "915", "missing": "43", "min": "0", "max": "3342", "mean": "348", "stdev": "242" }, { "name": "ratings_count", "index": "11", "type": "numeric", "distinct": "3881", "missing": "43", "min": "0", "max": "5629932", "mean": "21069", "stdev": "137621" } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }