{ "data_id": "478", "name": "collins", "exact_name": "collins", "version": 1, "version_label": null, "description": "**Author**: Jeff Collins \r\n**Source**: StatLib \r\n**Please cite**: \r\n\r\n**Deactivated because this version only has half the data and has clear label leakage cause by 'Genre'. Use version 4 instead.**\r\n\r\nThe following are data used in an analysis of the Brown and Frown corpora for my doctoral dissertation titled ``Variations in Written English: Characterizing Authors' Rhetorical Language Choices Across Corpora of Published Texts\" (Completed at Carnegie Mellon Univ, 2003). The source of the corpora was the ICAME CD-ROM (get info at ).\r\n\r\nThe data were generated from the texts using tagging and visualization software, Docuscope.\r\n\r\nThe first row is the variable names. The genre of each text (assigned by the Brown corpus compilers) is in 'Genre' column and the corpus is listed in the 'corpus' column with 1=Brown and 2=Frown corpus.\r\n\r\nThe dataset may be freely used and distributed for non-commercial purposes.\r\n\r\nJeff Collins 11 July 2003\r\n\r\n\r\n\r\nInformation about the dataset\r\nCLASSTYPE: nominal\r\nCLASSINDEX: last", "format": "ARFF", "uploader": "Joaquin Vanschoren", "uploader_id": 2, "visibility": "public", "creator": null, "contributor": null, "date": "2014-09-28 23:51:43", "update_comment": "attribute counter is a row id", "last_update": "2015-04-15 17:08:50", "licence": "Public", "status": "deactivated", "error_message": null, "url": "https:\/\/www.openml.org\/data\/download\/52590\/collins.arff", "default_target_attribute": "Corp.Genre", "row_id_attribute": "Counter", "ignore_attribute": "\"Text\"", "runs": 32238, "suggest": { "input": [ "collins", "The following are data used in an analysis of the Brown and Frown corpora for my doctoral dissertation titled ``Variations in Written English: Characterizing Authors' Rhetorical Language Choices Across Corpora of Published Texts\" (Completed at Carnegie Mellon Univ, 2003). The source of the corpora was the ICAME CD-ROM (get info at ). The data were generated from the texts using tagging and visualization software, Docuscope. The first row is the variable names. The genre of each text (assig " ], "weight": 5 }, "qualities": { "NumberOfInstances": 500, "NumberOfFeatures": 22, "NumberOfClasses": 15, "NumberOfMissingValues": 0, "NumberOfInstancesWithMissingValues": 0, "NumberOfNumericFeatures": 19, "NumberOfSymbolicFeatures": 3, "MeanSkewnessOfNumericAtts": 1.21821335635969, "Quartile1StdDevOfNumericAtts": 0.41142614811251527, "REPTreeDepth2AUC": 0.49574650513494795, "CfsSubsetEval_kNN1NErrRate": 0, "kNN1NAUC": 1, "J48.001.Kappa": 1, "MeanStdDevOfNumericAtts": 1.0452797721655507, "Quartile2AttributeEntropy": 3.648562076012777, "REPTreeDepth2ErrRate": 0.84, "CfsSubsetEval_kNN1NKappa": 1, "kNN1NErrRate": 0, "MajorityClassPercentage": 16, "MinAttributeEntropy": -0, "Quartile2KurtosisOfNumericAtts": 0.7633424303002827, "REPTreeDepth2Kappa": 0, "ClassEntropy": 3.648562076012777, "kNN1NKappa": 1, "MajorityClassSize": 80, "MinKurtosisOfNumericAtts": -0.5698798457138974, "Quartile2MeansOfNumericAtts": 1.32206, "REPTreeDepth3AUC": 0.49574650513494795, "DecisionStumpAUC": 0.6860989947887339, "MaxAttributeEntropy": 8.965784284662018, "MinMeansOfNumericAtts": 0.28419999999999995, "Quartile2MutualInformation": 3.64856207601278, "REPTreeDepth3ErrRate": 0.84, "DecisionStumpErrRate": 0.738, "MaxKurtosisOfNumericAtts": 29.62455127322131, "MinMutualInformation": 0, "Quartile2SkewnessOfNumericAtts": 0.8729927853121452, "REPTreeDepth3Kappa": 0, "DecisionStumpKappa": 0.13972779028941518, "MaxMeansOfNumericAtts": 31.48718, "MinNominalAttDistinctValues": 1, "PercentageOfBinaryFeatures": 0, "Quartile2StdDevOfNumericAtts": 0.7706544869632264, "RandomTreeDepth1AUC": 0.9399046820683302, "Dimensionality": 0.044, "MaxMutualInformation": 3.64856207601278, "MinSkewnessOfNumericAtts": -0.06825964191203086, "PercentageOfInstancesWithMissingValues": 0, "Quartile3AttributeEntropy": 8.965784284662018, "RandomTreeDepth1ErrRate": 0.284, "EquivalentNumberOfAtts": 1.499999999999999, "MaxNominalAttDistinctValues": 500, "MinStdDevOfNumericAtts": 0.27275253966882085, "PercentageOfMissingValues": 0, "Quartile3KurtosisOfNumericAtts": 3.7036204365443157, "AutoCorrelation": 0.9719438877755511, "RandomTreeDepth1Kappa": 0.6846504728022136, "J48.00001.AUC": 1, "MaxSkewnessOfNumericAtts": 4.220469825930483, "MinorityClassPercentage": 1.2, "PercentageOfNumericFeatures": 86.36363636363636, "Quartile3MeansOfNumericAtts": 2.6724200000000002, "CfsSubsetEval_DecisionStumpAUC": 1, "RandomTreeDepth2AUC": 0.9399046820683302, "J48.00001.ErrRate": 0, "MaxStdDevOfNumericAtts": 5.208705277603819, "MinorityClassSize": 6, "PercentageOfSymbolicFeatures": 13.636363636363635, "Quartile3MutualInformation": 3.64856207601278, "CfsSubsetEval_DecisionStumpErrRate": 0, "RandomTreeDepth2ErrRate": 0.284, "J48.00001.Kappa": 1, "MeanAttributeEntropy": 4.204782120224931, "NaiveBayesAUC": 0.9219931754984056, "Quartile1AttributeEntropy": 0, "Quartile3SkewnessOfNumericAtts": 1.6103948341786611, "CfsSubsetEval_DecisionStumpKappa": 1, "RandomTreeDepth2Kappa": 0.6846504728022136, "J48.0001.AUC": 1, "MeanKurtosisOfNumericAtts": 4.030981426007789, "NaiveBayesErrRate": 0.36, "Quartile1KurtosisOfNumericAtts": 0.31263763178831194, "Quartile3StdDevOfNumericAtts": 1.010259841572945, "CfsSubsetEval_NaiveBayesAUC": 1, "RandomTreeDepth3AUC": 0.9399046820683302, "J48.0001.ErrRate": 0, "MeanMeansOfNumericAtts": 3.31444, "MeanMutualInformation": 2.4323747173418533, "NaiveBayesKappa": 0.6030906011854361, "Quartile1MeansOfNumericAtts": 0.6347200000000002, "REPTreeDepth1AUC": 0.49574650513494795, "CfsSubsetEval_NaiveBayesErrRate": 0, "RandomTreeDepth3ErrRate": 0.284, "J48.0001.Kappa": 1, "MeanNoiseToSignalRatio": 0.7286736662104428, "NumberOfBinaryFeatures": 0, "Quartile1MutualInformation": 0, "REPTreeDepth1ErrRate": 0.84, "CfsSubsetEval_NaiveBayesKappa": 1, "RandomTreeDepth3Kappa": 0.6846504728022136, "J48.001.AUC": 1, "MeanNominalAttDistinctValues": 132.75, "Quartile1SkewnessOfNumericAtts": 0.5431345075805047, "REPTreeDepth1Kappa": 0, "CfsSubsetEval_kNN1NAUC": 1, "StdvNominalAttDistinctValues": 244.9222665799634, "J48.001.ErrRate": 0 }, "tags": [ { "tag": "OpenML100", "uploader": "348" }, { "tag": "study_1", "uploader": "2" }, { "tag": "study_123", "uploader": "3886" }, { "tag": "study_14", "uploader": "64" }, { "tag": "study_34", "uploader": "1" }, { "tag": "study_7", "uploader": "64" }, { "tag": "trivial", "uploader": "1140" } ], "features": [ { "name": "Corp.Genre", "index": "23", "type": "nominal", "distinct": "15", "missing": "0", "target": "1", "distr": [ [ "101", "102", "103", "104", "105", "106", "107", "108", "109", "110", "111", "112", "113", "114", "115" ], [ [ "44", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "27", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "17", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "17", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "36", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "48", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "75", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "30", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "80", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "29", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "24", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "6", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "29", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "29", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "9" ] ] ] }, { "name": "Text", "index": "0", "type": "nominal", "distinct": "500", "missing": "0", "ignore": "1", "distr": [] }, { "name": "FirstPerson", "index": "1", "type": "numeric", "distinct": "139", "missing": "0", "min": "0", "max": "6", "mean": "1", "stdev": "1" }, { "name": "InnerThinking", "index": "2", "type": "numeric", "distinct": "262", "missing": "0", "min": "1", "max": "6", "mean": "3", "stdev": "1" }, { "name": "ThinkPositive", "index": "3", "type": "numeric", "distinct": "154", "missing": "0", "min": "0", "max": "2", "mean": "1", "stdev": "0" }, { "name": "ThinkNegative", "index": "4", "type": "numeric", "distinct": "216", "missing": "0", "min": "0", "max": "4", "mean": "1", "stdev": "1" }, { "name": "ThinkAhead", "index": "5", "type": "numeric", "distinct": "172", "missing": "0", "min": "0", "max": "4", "mean": "1", "stdev": "0" }, { "name": "ThinkBack", "index": "6", "type": "numeric", "distinct": "130", "missing": "0", "min": "0", "max": "2", "mean": "1", "stdev": "0" }, { "name": "Reasoning", "index": "7", "type": "numeric", "distinct": "262", "missing": "0", "min": "1", "max": "6", "mean": "3", "stdev": "1" }, { "name": "Share_SocTies", "index": "8", "type": "numeric", "distinct": "260", "missing": "0", "min": "0", "max": "6", "mean": "2", "stdev": "1" }, { "name": "Direct_Activity", "index": "9", "type": "numeric", "distinct": "78", "missing": "0", "min": "0", "max": "3", "mean": "0", "stdev": "0" }, { "name": "Interacting", "index": "10", "type": "numeric", "distinct": "160", "missing": "0", "min": "0", "max": "8", "mean": "1", "stdev": "1" }, { "name": "Notifying", "index": "11", "type": "numeric", "distinct": "218", "missing": "0", "min": "1", "max": "5", "mean": "3", "stdev": "1" }, { "name": "LinearGuidance", "index": "12", "type": "numeric", "distinct": "352", "missing": "0", "min": "0", "max": "11", "mean": "5", "stdev": "2" }, { "name": "WordPicture", "index": "13", "type": "numeric", "distinct": "371", "missing": "0", "min": "1", "max": "15", "mean": "5", "stdev": "2" }, { "name": "SpaceInterval", "index": "14", "type": "numeric", "distinct": "228", "missing": "0", "min": "0", "max": "4", "mean": "1", "stdev": "1" }, { "name": "Motion", "index": "15", "type": "numeric", "distinct": "123", "missing": "0", "min": "0", "max": "2", "mean": "1", "stdev": "0" }, { "name": "PastEvents", "index": "16", "type": "numeric", "distinct": "283", "missing": "0", "min": "0", "max": "6", "mean": "2", "stdev": "1" }, { "name": "TimeInterval", "index": "17", "type": "numeric", "distinct": "173", "missing": "0", "min": "0", "max": "5", "mean": "1", "stdev": "1" }, { "name": "ShiftingEvents", "index": "18", "type": "numeric", "distinct": "132", "missing": "0", "min": "0", "max": "2", "mean": "1", "stdev": "0" }, { "name": "Text_Coverage", "index": "19", "type": "numeric", "distinct": "443", "missing": "0", "min": "17", "max": "43", "mean": "31", "stdev": "5" }, { "name": "Genre", "index": "20", "type": "nominal", "distinct": "15", "missing": "0", "distr": [ [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15" ], [ [ "44", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "27", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "17", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "17", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "36", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "48", "0", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "75", "0", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "30", "0", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "80", "0", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "29", "0", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "24", "0", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "6", "0", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "29", "0", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "29", "0" ], [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "9" ] ] ] }, { "name": "Counter", "index": "21", "type": "numeric", "distinct": "500", "missing": "0", "identifier": "1", "min": "1", "max": "500", "mean": "251", "stdev": "144" }, { "name": "Corpus", "index": "22", "type": "nominal", "distinct": "1", "missing": "0", "distr": [ [ "1" ], [ [ "44", "27", "17", "17", "36", "48", "75", "30", "80", "29", "24", "6", "29", "29", "9" ] ] ] } ], "nr_of_issues": 0, "nr_of_downvotes": 0, "nr_of_likes": 0, "nr_of_downloads": 0, "total_downloads": 0, "reach": 0, "reuse": 0, "impact_of_reuse": 0, "reach_of_reuse": 0, "impact": 0 }