<?xml version="1.0" standalone="no"?>
<!DOCTYPE inputProcessorConfiguration SYSTEM "input-processor-config.dtd">


<inputProcessorConfiguration>
    <!-- The input: either a directory of files, or an explicit list of
        individual files if you want to write them all out.
        These should be HTML and/or text files.
        Required, repeatable. -->
    <!--<input location="/dir/of/files/or/single/file" type=""/>-->
    <!-- OR -->
    <!-- NOTE(review): this entry uses a `file` attribute, whereas the
        commented-out example above uses `location`; confirm which one
        input-processor-config.dtd declares. type="BOGUS" looks like a
        placeholder value; verify what the processor expects here. -->
    <input file="/home/cluster/crawler/WebBuilder-09.23.03/www.bls.gov.xml" type="BOGUS"/>

    <!-- The output files.  Different files are defined by their outputType
        attribute.  the valid attributes are:

            DocIDMapping - file containing mapping between 
                            doc ID # and full path of doc

            CleanedFile - ea. line of this file is the filtered text of a 
                            doc.  Line # corresponds to DocID (above)

            TermIDMapping - ea. line is the following mapping:
                            TermID - Term
                            where the Terms are all the unique terms from the 
                            CleanedFile, and IDs are unique #'s

            DocTermMatrix - ea. line is the following mapping:
                            DocID - TermID - TermCount
                            where DocID is from DocIDMapping, TermID is
                            from TermIDMapping and the term count is the
                            number of that particular term in the file
                            that corresponds to DocID
            
        -->
    <output fileName="/home/cluster/clustering/BLS-output/doc_docid.txt" outputType="DocIDMapping"/>
    <output fileName="/home/cluster/clustering/BLS-output/cleaned_file.txt" outputType="CleanedFile"/>
    <output fileName="/home/cluster/clustering/BLS-output/term_termid.txt" outputType="TermIDMapping"/>
    <output fileName="/home/cluster/clustering/BLS-output/matrix.txt" outputType="DocTermMatrix"/>
    <!-- NOTE(review): the WebMetrics outputType is not described in the
        list of output types in the comment preceding these entries;
        confirm its file format against the processor and document it. -->
    <output fileName="/home/cluster/clustering/BLS-output/webMetrics.txt" outputType="WebMetrics"/>
    
    <!-- the set of filters to apply to the input files.
        these are applied in the order specified
        optional, non-repeatable -->
    <filterSet>
        <!-- One or more <filter> children are required; <filter> may repeat.
            The first two filters below carry no parameters. -->
        <filter name="HtmlTagsFilter"/>
        <filter name="LowerCaseFilter"/>

        <!-- A filter may carry <param> children; <param> is optional and
            may repeat within a single filter. -->
        <filter name="WordFilter">
            <param name="pattern" value="[a-zA-Z][a-zA-Z0-9]*"/>
        </filter>
        <filter name="LengthFilter">
            <param name="minLength" value="2"/>
        </filter>
        <filter name="StopWordFilter">
            <param name="stopWordFile" value="/home/cluster/clustering/util/stopwords1.txt"/>
        </filter>
        <filter name="WordNetFilter">
            <param name="nounFile" value="/home/cluster/clustering/util/index.noun"/>
            <param name="adjFile" value="/home/cluster/clustering/util/index.adj"/>
        </filter>
    </filterSet>
</inputProcessorConfiguration>


