View Javadoc
1   /*
2    * This file is part of dependency-check-core.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   * Copyright (c) 2012 Jeremy Long. All Rights Reserved.
17   */
18  package org.owasp.dependencycheck.data.lucene;
19  
20  import java.io.Reader;
21  import org.apache.lucene.analysis.Analyzer;
22  import org.apache.lucene.analysis.TokenStream;
23  import org.apache.lucene.analysis.Tokenizer;
24  import org.apache.lucene.analysis.core.LowerCaseFilter;
25  import org.apache.lucene.analysis.core.StopAnalyzer;
26  import org.apache.lucene.analysis.core.StopFilter;
27  import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
28  import org.apache.lucene.util.Version;
29  
30  /**
31   * A Lucene field analyzer used to analyzer queries against the CPE data.
32   *
33   * @author Jeremy Long
34   */
35  public class SearchFieldAnalyzer extends Analyzer {
36  
37      /**
38       * The Lucene Version used.
39       */
40      private final Version version;
41      /**
42       * A local reference to the TokenPairConcatenatingFilter so that we can clear any left over state if this analyzer is re-used.
43       */
44      private TokenPairConcatenatingFilter concatenatingFilter;
45  
46      /**
47       * Constructs a new SearchFieldAnalyzer.
48       *
49       * @param version the Lucene version
50       */
51      public SearchFieldAnalyzer(Version version) {
52          this.version = version;
53      }
54  
55      /**
56       * Creates a the TokenStreamComponents used to analyze the stream.
57       *
58       * @param fieldName the field that this lucene analyzer will process
59       * @param reader a reader containing the tokens
60       * @return the token stream filter chain
61       */
62      @Override
63      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
64          final Tokenizer source = new AlphaNumericTokenizer(version, reader);
65  
66          TokenStream stream = source;
67  
68          stream = new WordDelimiterFilter(stream,
69                  WordDelimiterFilter.GENERATE_WORD_PARTS
70                  | WordDelimiterFilter.GENERATE_NUMBER_PARTS
71                  | WordDelimiterFilter.PRESERVE_ORIGINAL
72                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
73                  | WordDelimiterFilter.SPLIT_ON_NUMERICS
74                  | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null);
75  
76          stream = new LowerCaseFilter(version, stream);
77          stream = new UrlTokenizingFilter(stream);
78          concatenatingFilter = new TokenPairConcatenatingFilter(stream);
79          stream = concatenatingFilter;
80          stream = new StopFilter(version, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
81  
82          return new TokenStreamComponents(source, stream);
83      }
84  
85      /**
86       * <p>
87       * Resets the analyzer and clears any internal state data that may have been left-over from previous uses of the analyzer.</p>
88       * <p>
89       * <b>If this analyzer is re-used this method must be called between uses.</b></p>
90       */
91      public void clear() {
92          if (concatenatingFilter != null) {
93              concatenatingFilter.clear();
94          }
95      }
96  }