Authors
Jaap Kamps, Maarten Marx, Maarten de Rijke, and Börkur Sigurbjörnsson.
book
Proceedings of the First Workshop of the INitiative for the Evaluation of XML retrieval (INEX). Pages: 41-48. ERCIM Publications. 2003. [pdf]
abstract
Current information retrieval systems typically ignore structural aspects of documents, solely focusing on the textual content instead. But documents containing additional structure in the form of HTML, XML, or SGML mark-up are pervasive on the Internet. The XML retrieval task presents a number of challenges for information retrieval, for we can no longer rely on the appropriate unit of retrieval to be fixed, or to be known beforehand. This implies that the effectiveness of standard IR techniques, such as morphological normalization methods, may not carry over to this particular task. This paper describes the fully automatic runs for the INEX 2002 task submitted by the Language and Inference Technology Group at the University of Amsterdam. We investigate the effectiveness of two standard approaches to morphological normalization, both a linguistically motivated stemming algorithm and a knowledge-poor character n-gramming technique. Our results show that morphological normalization is an important issue for XML retrieval. For all measurements, the combined run and the n-gram run perform better than the stemmed run.
bibtex
% Workshop paper from the INEX 2002 evaluation; proceedings published by ERCIM in 2003.
% NOTE(review): the citation key says 2023 but the publication year is 2003; the key is
% kept unchanged so existing \cite commands keep resolving -- confirm before renaming.
@inproceedings{kamps-2023-inex,
  author    = {Kamps, Jaap and Marx, Maarten and de Rijke, Maarten and Sigurbj{\"o}rnsson, B{\"o}rkur},
  title     = {The importance of morphological normalization for {XML} retrieval},
  booktitle = {Proceedings of the First Workshop of the {INitiative} for the Evaluation of {XML} retrieval ({INEX})},
  year      = {2003},
  venue     = {Schloss Dagstuhl, Germany},
  pages     = {41--48},
  publisher = {ERCIM Publications},
  keywords  = {information retrieval, xml retrieval, morphological normalization},
}