{ "data_id": "46099", "name": "Phishing_Email_Dataset", "exact_name": "Phishing_Email_Dataset", "version": 1, "version_label": null, "description": "### Description:\n\nThe dataset named \"phishing_email.csv\" comprises email contents that have been classified into phishing or legitimate categories. Each row in the dataset is an email entry, containing two fields: `text_combined` and `label`. The `text_combined` field holds the entire content of an email, which may include the subject, body, and any embedded URLs, while the `label` field classifies the email as phishing (`1`) or legitimate (`0`).\n\n### Attribute Description:\n\n- **text_combined**: Contains a comprehensive dumped text of an email, amalgamating the subject, the body, and possibly URLs. The text is unstructured, potentially lengthy, and may exhibit a wide range of natural language features, including informal language, technical terminology, and various linguistic structures. Examples of content range from technical support emails, linguistic textbook descriptions, corporate summaries regarding energy market negotiations, to phishing schemes pretending to offer financial opportunities.\n- **label**: A binary indicator with `1` representing a phishing email and `0` signifying a legitimate email. This classification serves as the dataset's target variable for predictive modeling tasks aimed at identifying phishing attempts.\n\n### Use Case:\n\nThis dataset can significantly contribute to cybersecurity efforts, particularly in developing machine learning models capable of detecting and filtering phishing attempts from legitimate email communications. Researchers and developers can leverage the rich, varied content of the emails to train models that understand the nuances and patterns indicative of phishing. 
Additionally, linguistic analysts may find the dataset beneficial for studying language use in fraudulent versus legitimate emails, potentially uncovering linguistic markers that are characteristic of phishing attempts. Moreover, organizations focused on strengthening their email security protocols can use insights derived from this dataset to better educate their employees on recognizing and handling suspicious emails, ultimately reducing the risk of phishing attacks.", "format": "arff", "uploader": "Iwo Godzwon", "uploader_id": 39999, "visibility": "public", "creator": "*Al-Subaiey, A., Al-Thani, M., Alam, N. A., Antora, K. F., Khandakar, A., & Zaman, S. A. U", "contributor": "\"None\"", "date": "2024-05-31 17:48:43", "update_comment": null, "last_update": "2024-05-31 17:48:43", "licence": "Attribution-ShareAlike (CC BY-SA)", "status": "active", "error_message": null, "url": "https:\/\/api.openml.org\/data\/download\/22120543\/dataset", "kaggle_url": null, "default_target_attribute": null, "row_id_attribute": null, "ignore_attribute": null, "runs": 0, "suggest": { "input": [ "Phishing_Email_Dataset", "### Description: The dataset named \"phishing_email.csv\" comprises email contents that have been classified into phishing or legitimate categories. Each row in the dataset is an email entry, containing two fields: `text_combined` and `label`. The `text_combined` field holds the entire content of an email, which may include the subject, body, and any embedded URLs, while the `label` field classifies the email as phishing (`1`) or legitimate (`0`). 
### Attribute Description: - **text_combined**: Con " ], "weight": 5 }, "qualities": { "NumberOfInstances": 82486, "NumberOfFeatures": 2, "NumberOfClasses": null, "NumberOfMissingValues": 0, "NumberOfInstancesWithMissingValues": 0, "NumberOfNumericFeatures": 0, "NumberOfSymbolicFeatures": 1, "PercentageOfBinaryFeatures": 50, "PercentageOfInstancesWithMissingValues": 0, "PercentageOfMissingValues": 0, "AutoCorrelation": null, "PercentageOfNumericFeatures": 0, "Dimensionality": 2.424653880658536e-5, "PercentageOfSymbolicFeatures": 50, "MajorityClassPercentage": null, "MajorityClassSize": null, "MinorityClassPercentage": null, "MinorityClassSize": null, "NumberOfBinaryFeatures": 1 }, "tags": [], "features": [ { "name": "text_combined", "index": "0", "type": "string", "distinct": "82078", "missing": "0" }, { "name": "label", "index": "1", "type": "nominal", "distinct": "2", "missing": "0", "distr": [] } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }