% Masada et al. (2012), "Clustering Documents with Maximal Substrings".
% Record exported from an institutional repository; key kept as exported so
% existing citations continue to resolve.
% The export originally stored the abstract in the note field, followed by the
% citation string "Lecture Notes in Business Information Processing, 102,
% pp.19-34; 2012". The abstract now lives in the abstract field (not printed by
% standard styles); the citation string duplicated journal/volume/pages/year
% and is kept only in this comment.
% NOTE(review): the journal value looks like a book-series name; this record
% may be better typed as inproceedings/incollection with a booktitle. Confirm
% against the publisher record before changing the entry type.
@article{oai:nagasaki-u.repo.nii.ac.jp:00010791,
  author   = {Masada, Tomonari and Takasu, Atsuhiro and Shibata, Yuichiro and Oguri, Kiyoshi},
  title    = {Clustering Documents with Maximal Substrings},
  journal  = {Lecture Notes in Business Information Processing},
  volume   = {102},
  pages    = {19--34},
  month    = may,
  year     = {2012},
  abstract = {This paper provides experimental results showing that we can use maximal substrings as elementary building blocks of documents in place of the words extracted by a current state-of-the-art supervised word extraction. Maximal substrings are defined as the substrings each giving a smaller number of occurrences even by appending only one character to its head or tail. The main feature of maximal substrings is that they can be extracted quite efficiently in an unsupervised manner. We extract maximal substrings from a document set and represent each document as a bag of maximal substrings. We also obtain a bag of words representation by using a state-of-the-art supervised word extraction over the same document set. We then apply the same document clustering method to both representations and obtain two clustering results for a comparison of their quality. We adopt a Bayesian document clustering based on Dirichlet compound multinomials for avoiding overfitting. Our experiment shows that the clustering quality achieved with maximal substrings is acceptable enough to use them in place of the words extracted by a supervised word extraction.},
}