View Javadoc
1   /*
2    * This file is part of dependency-check-core.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   * Copyright (c) 2012 Jeremy Long. All Rights Reserved.
17   */
18  package org.owasp.dependencycheck.data.lucene;
19  
20  import java.io.Reader;
21  import org.apache.lucene.analysis.Analyzer;
22  import org.apache.lucene.analysis.TokenStream;
23  import org.apache.lucene.analysis.Tokenizer;
24  import org.apache.lucene.analysis.core.LowerCaseFilter;
25  import org.apache.lucene.analysis.core.StopAnalyzer;
26  import org.apache.lucene.analysis.core.StopFilter;
27  import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
28  import org.apache.lucene.util.Version;
29  
30  /**
31   * <p>
32   * A Lucene Analyzer that utilizes the WhitespaceTokenizer, WordDelimiterFilter, LowerCaseFilter, and StopFilter. The intended
33   * purpose of this Analyzer is to index the CPE fields vendor and product.</p>
34   *
35   * @author Jeremy Long
36   */
37  public class FieldAnalyzer extends Analyzer {
38  
39      /**
40       * The Lucene Version used.
41       */
42      private final Version version;
43  
44      /**
45       * Creates a new FieldAnalyzer.
46       *
47       * @param version the Lucene version
48       */
49      public FieldAnalyzer(Version version) {
50          this.version = version;
51      }
52  
53      /**
54       * Creates the TokenStreamComponents
55       *
56       * @param fieldName the field name being analyzed
57       * @param reader the reader containing the input
58       * @return the TokenStreamComponents
59       */
60      @Override
61      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
62          final Tokenizer source = new AlphaNumericTokenizer(version, reader);
63  
64          TokenStream stream = source;
65  
66          stream = new WordDelimiterFilter(stream,
67                  WordDelimiterFilter.CATENATE_WORDS
68                  | WordDelimiterFilter.GENERATE_WORD_PARTS
69                  | WordDelimiterFilter.GENERATE_NUMBER_PARTS
70                  | WordDelimiterFilter.PRESERVE_ORIGINAL
71                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
72                  | WordDelimiterFilter.SPLIT_ON_NUMERICS
73                  | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null);
74  
75          stream = new LowerCaseFilter(version, stream);
76          stream = new StopFilter(version, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
77  
78          return new TokenStreamComponents(source, stream);
79      }
80  }