Voir la notice de l'article provenant de la source Library of Science
@article{IJAMCS_2018_28_4_a13,
  author    = {Cichosz, P.},
  title     = {A case study in text mining of discussion forum posts: {Classification} with bag of words and global vectors},
  journal   = {International Journal of Applied Mathematics and Computer Science},
  volume    = {28},
  number    = {4},
  year      = {2018},
  pages     = {787--801},
  publisher = {mathdoc},
  language  = {en},
  url       = {http://geodesic.mathdoc.fr/item/IJAMCS_2018_28_4_a13/}
}
TY - JOUR AU - Cichosz, P. TI - A case study in text mining of discussion forum posts: Classification with bag of words and global vectors JO - International Journal of Applied Mathematics and Computer Science PY - 2018 SP - 787 EP - 801 VL - 28 IS - 4 PB - mathdoc UR - http://geodesic.mathdoc.fr/item/IJAMCS_2018_28_4_a13/ LA - en ID - IJAMCS_2018_28_4_a13 ER -
%0 Journal Article %A Cichosz, P. %T A case study in text mining of discussion forum posts: Classification with bag of words and global vectors %J International Journal of Applied Mathematics and Computer Science %D 2018 %P 787-801 %V 28 %N 4 %I mathdoc %U http://geodesic.mathdoc.fr/item/IJAMCS_2018_28_4_a13/ %G en %F IJAMCS_2018_28_4_a13
Cichosz, P. A case study in text mining of discussion forum posts: Classification with bag of words and global vectors. International Journal of Applied Mathematics and Computer Science, Tome 28 (2018) no. 4, pp. 787-801. http://geodesic.mathdoc.fr/item/IJAMCS_2018_28_4_a13/
[1] Aggarwal, C.C. and Zhai, C.-X. (Eds.) (2012). Mining Text Data, Springer, New York, NY.
[2] Aswani Kumar, C. and Srinivas, S. (2006). Latent semantic indexing using eigenvalue analysis for efficient information retrieval, International Journal of Applied Mathematics and Computer Science 16(4): 551–558.
[3] Bayes, T. (1763). An essay towards solving a problem in the doctrine of chances, Philosophical Transactions of the Royal Society of London 53: 370–418.
[4] Bilski, A. and Wojciechowski, J. (2016). Automatic parametric fault detection in complex analog systems based on a method of minimum node selection, International Journal of Applied Mathematics and Computer Science 26(3): 655–668, DOI: 10.1515/amcs-2016-0045.
[5] Blei, D.M., Ng, A.Y. and Jordan, M.I. (2003). Latent Dirichlet allocation, Journal of Machine Learning Research 3: 993–1022.
[6] Breiman, L. (1996). Bagging predictors, Machine Learning 24(2): 123–140.
[7] Breiman, L. (2001). Random forests, Machine Learning 45(1): 5–32.
[8] Breiman, L., Friedman, J.H., Olshen, R.A. and Stone, C.J. (1984). Classification and Regression Trees, Chapman and Hall, New York, NY.
[9] Cestnik, B. (1990). Estimating probabilities: A crucial task in machine learning, Proceedings of the 9th European Conference on Artificial Intelligence (ECAI-90), Stockholm, Sweden, pp. 147–149.
[10] Cichosz, P. (2015). Data Mining Algorithms: Explained Using R, Wiley, Chichester.
[11] Cortes, C. and Vapnik, V.N. (1995). Support-vector networks, Machine Learning 20(3): 273–297.
[12] Cristianini, N. and Shawe-Taylor, J. (2000). An Introduction to Support Vector Machines and Other Kernel-Based Learning Methods, Cambridge University Press, New York, NY.
[13] Dařena, F. and Žižka, J. (2017). Ensembles of classifiers for parallel categorization of large number of text documents expressing opinions, Journal of Applied Economic Sciences 12(1): 25–35.
[14] Dietterich, T.G. (2000). Ensemble methods in machine learning, Proceedings of the 1st International Workshop on Multiple Classifier Systems, Cagliari, Italy, pp. 1–15.
[15] Domingos, P. and Pazzani, M. (1997). On the optimality of the simple Bayesian classifier under zero-one loss, Machine Learning 29(2–3): 103–137.
[16] Duchi, J., Hazan, E. and Singer, Y. (2011). Adaptive subgradient methods for online learning and stochastic optimization, Journal of Machine Learning Research 12: 2121–2159.
[17] Dumais, S.T. (2005). Latent semantic analysis, Annual Review of Information Science and Technology 38(1): 188–229.
[18] Dumais, S.T., Platt, J.C., Heckerman, D. and Sahami, M. (1998). Inductive learning algorithms and representations for text categorization, Proceedings of the 7th International Conference on Information and Knowledge Management (CIKM-98), Bethesda, MD, USA, pp. 148–155.
[19] Egan, J.P. (1975). Signal Detection Theory and ROC Analysis, Academic Press, New York, NY.
[20] Fawcett, T. (2006). An introduction to ROC analysis, Pattern Recognition Letters 27(8): 861–874.
[21] Forman, G. (2003). An extensive empirical study of feature selection measures for text classification, Journal of Machine Learning Research 3: 1289–1305.
[22] Goldberg, Y. and Levy, O. (2014). word2vec Explained: Deriving Mikolov et al.’s negative sampling word-embedding method, arXiv: 1402.3722.
[23] Guyon, I.M. and Elisseeff, A. (2003). An introduction to variable and feature selection, Journal of Machine Learning Research 3: 1157–1182.
[24] Hamel, L.H. (2009). Knowledge Discovery with Support Vector Machines, Wiley, New York, NY.
[25] Hand, D.J. and Yu, K. (2001). Idiot’s Bayes—not so stupid after all?, International Statistical Review 69(3): 385–399.
[26] Heaps, H.S. (1978). Information Retrieval: Computational and Theoretical Aspects, Academic Press, New York, NY.
[27] Hilbe, J.M. (2009). Logistic Regression Models, Chapman and Hall, New York, NY.
[28] Holtz, P., Kronberger, N. and Wagner, W. (2012). Analyzing Internet forums: A practical guide, Journal of Media Psychology 24(2): 55–66.
[29] Joachims, T. (1998). Text categorization with support vector machines: Learning with many relevant features, Proceedings of the 10th European Conference on Machine Learning (ECML-98), Chemnitz, Germany, pp. 137–142.
[30] Joachims, T. (2002). Learning to Classify Text by Support Vector Machines: Methods, Theory, and Algorithms, Springer, New York, NY.
[31] Koprinska, I., Poon, J., Clark, J. and Chan, J. (2007). Learning to classify e-mail, Information Sciences: An International Journal 177(10): 2167–2187.
[32] Lau, J.H. and Baldwin, T. (2016). An empirical evaluation of doc2vec with practical insights into document embedding generation, Proceedings of the 1st Workshop on Representation Learning for NLP, Berlin, Germany, pp. 78–86.
[33] Le, Q.V. and Mikolov, T. (2014). Distributed representations of sentences and documents, Proceedings of the 31st International Conference on Machine Learning (ICML-14), Beijing, China, pp. 1188–1196.
[34] Lewis, D.D. (1998). Naive (Bayes) at forty: The independence assumption in information retrieval, Proceedings of the 10th European Conference on Machine Learning (ECML-98), Chemnitz, Germany, pp. 4–15.
[35] Liaw, A. and Wiener, M. (2002). Classification and regression by randomForest, R News 2(3): 18–22, http://CRAN.R-project.org/doc/Rnews/.
[36] Liu, H. and Motoda, H. (1998). Feature Selection for Knowledge Discovery and Data Mining, Springer, New York, NY.
[37] Liu, H., Motoda, H., Setiono, R. and Zhao, Z. (2010). Feature selection: An ever-evolving frontier in data mining, Proceedings of the 4th Workshop on Feature Selection in Data Mining (FSDM-10), Hyderabad, India, pp. 4–13.
[38] Lui, A. K.-F., Li, S.C. and Choy, S.O. (2007). An evaluation of automatic text categorization in online discussion analysis, Proceedings of the 7th IEEE International Conference on Advanced Learning Technologies (ICALT-2007), Niigata, Japan, pp. 205–209.
[39] Manning, C.D., Raghavan, P., and Schütze, H. (2008). Introduction to Information Retrieval, Cambridge University Press, Cambridge.
[40] Marra, R.M., Moore, J.L. and Klimczak, A.K. (2004). Content analysis of online discussion forums: A comparative analysis of protocols, Educational Technology Research and Development 52(2): 23–40.
[41] McCallum, A. and Nigam, K. (1998). A comparison of event models for naive Bayes text classification, Proceedings of the AAAI/ICML-98 Workshop on Learning for Text Categorization, Madison, WI, USA, pp. 41–48.
[42] Meyer, D., Dimitriadou, E., Hornik, K., Weingessel, A. and Leisch, F. (2015). e1071: Misc Functions of the Department of Statistics, Probability Theory Group (Formerly: E1071), TU Wien, R package version 1.6-7, https://CRAN.R-project.org/package=e1071.
[43] Mikolov, T., Chen, K., Corrado, G.S. and Dean, J. (2013a). Efficient estimation of word representations in vector space, arXiv:1301.3781.
[44] Mikolov, T., Le, Q.V. and Sutskever, I. (2013b). Exploiting similarities among languages for machine translation, arXiv:1309.4168.
[45] Mitchell, J. and Lapata, M. (2010). Composition in distributional models of semantics, Cognitive Science 34(8): 1388–1429.
[46] Moldovan, A., Boţ, R.I. and Wanka, G. (2005). Latent semantic indexing for patent documents, International Journal of Applied Mathematics and Computer Science 15(4): 551–560.
[47] Ooms, J. (2016). hunspell: Morphological Analysis and Spell Checker for R, R package version 2.3, https://CRAN.R-project.org/package=hunspell.
[48] Pennington, J., Socher, R. and Manning, C.D. (2014). GloVe: Global vectors for word representation, Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP-14), Doha, Qatar, pp. 1532–1543.
[49] Platt, J.C. (1998). Fast training of support vector machines using sequential minimal optimization, in B. Schölkopf et al. (Eds.), Advances in Kernel Methods: Support Vector Learning, MIT Press, Cambridge, MA, pp.185–208.
[50] Platt, J.C. (2000). Probabilistic outputs for support vector machines and comparison to regularized likelihood methods, in A.J. Smola et al. (Eds.), Advances in Large Margin Classifiers, MIT Press, Cambridge, MA, pp. 61–74.
[51] Quinlan, J.R. (1986). Induction of decision trees, Machine Learning 1: 81–106.
[52] R Development Core Team (2016). R: A Language and Environment for Statistical Computing, R Foundation for Statistical Computing, http://www.R-project.org.
[53] Radovanović, M. and Ivanović, M. (2008). Text mining: Approaches and applications, Novi Sad Journal of Mathematics 38(3): 227–234.
[54] Rios, G. and Zha, H. (2004). Exploring support vector machines and random forests for spam detection, Proceedings of the 1st International Conference on Email and Anti Spam (CEAS-04), Mountain View, CA, USA, pp. 398–403.
[55] Rousseau, F., Kiagias, E. and Vazirgiannis, M. (2015). Text categorization as a graph classification problem, Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (ACL-IJCNLP-15), Beijing, China, pp. 1702–1712.
[56] Said, D. and Wanas, N. (2011). Clustering posts in online discussion forum threads, International Journal of Computer Science and Information Technology 3(2): 1–14.
[57] Schölkopf, B. and Smola, A.J. (2001). Learning with Kernels, MIT Press, Cambridge, MA.
[58] Sebastiani, F. (2002). Machine learning in automated text categorization, ACM Computing Surveys 34(1): 1–47.
[59] Selivanov, D. (2016). text2vec: Modern Text Mining Framework for R, R package version 0.4.0, https://CRAN.R-project.org/package=text2vec.
[60] Siwek, K. and Osowski, S. (2016). Data mining methods for prediction of air pollution, International Journal of Applied Mathematics and Computer Science 26(2): 467–478, DOI: 10.1515/amcs-2016-0033.
[61] Szymański, J. (2014). Comparative analysis of text representation methods using classification, Cybernetics and Systems 45(2): 180–199.
[62] Wu, Q., Ye, Y., Zhang, H., Ng, M.K. and Ho, S.-H. (2014). ForesTexter: An efficient random forest algorithm for imbalanced text categorization, Knowledge-Based Systems 67: 105–116.
[63] Xu, B., Guo, X., Ye, Y. and Cheng, J. (2012). An improved random forest classifier for text categorization, Journal of Computers 7(12): 2913–2920.
[64] Xue, D. and Li, F. (2015). Research of text categorization model based on random forests, 2015 IEEE International Conference on Computational Intelligence and Communication Technology (CICT-15), Ghaziabad, India, pp. 173–176.
[65] Yang, Y. and Pedersen, J. (1997). A comparative study on feature selection in text categorization, Proceedings of the 14th International Conference on Machine Learning (ICML-97), Nashville, TN, USA, pp. 412–420.
[66] Yessenalina, A. and Cardie, C. (2011). Compositional matrix-space models for sentiment analysis, Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing (EMNLP-11), Edinburgh, UK, pp. 172–182.