1 /*
2 * This file is part of dependency-check-core.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 * Copyright (c) 2012 Jeremy Long. All Rights Reserved.
17 */
18 package org.owasp.dependencycheck.data.lucene;
19
20 import java.io.Reader;
21 import org.apache.lucene.analysis.Analyzer;
22 import org.apache.lucene.analysis.TokenStream;
23 import org.apache.lucene.analysis.Tokenizer;
24 import org.apache.lucene.analysis.core.LowerCaseFilter;
25 import org.apache.lucene.analysis.core.StopAnalyzer;
26 import org.apache.lucene.analysis.core.StopFilter;
27 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
28 import org.apache.lucene.util.Version;
29
30 /**
31 * <p>
32 * A Lucene Analyzer that utilizes the WhitespaceTokenizer, WordDelimiterFilter, LowerCaseFilter, and StopFilter. The intended
33 * purpose of this Analyzer is to index the CPE fields vendor and product.</p>
34 *
35 * @author Jeremy Long
36 */
37 public class FieldAnalyzer extends Analyzer {
38
39 /**
40 * The Lucene Version used.
41 */
42 private final Version version;
43
44 /**
45 * Creates a new FieldAnalyzer.
46 *
47 * @param version the Lucene version
48 */
49 public FieldAnalyzer(Version version) {
50 this.version = version;
51 }
52
53 /**
54 * Creates the TokenStreamComponents
55 *
56 * @param fieldName the field name being analyzed
57 * @param reader the reader containing the input
58 * @return the TokenStreamComponents
59 */
60 @Override
61 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
62 final Tokenizer source = new AlphaNumericTokenizer(version, reader);
63
64 TokenStream stream = source;
65
66 stream = new WordDelimiterFilter(stream,
67 WordDelimiterFilter.CATENATE_WORDS
68 | WordDelimiterFilter.GENERATE_WORD_PARTS
69 | WordDelimiterFilter.GENERATE_NUMBER_PARTS
70 | WordDelimiterFilter.PRESERVE_ORIGINAL
71 | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
72 | WordDelimiterFilter.SPLIT_ON_NUMERICS
73 | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null);
74
75 stream = new LowerCaseFilter(version, stream);
76 stream = new StopFilter(version, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
77
78 return new TokenStreamComponents(source, stream);
79 }
80 }