Improve usage of Lucene: add stop words, clean up code, etc.

Jeremy Long
2017-08-13 13:18:58 -04:00
parent fb2b3159e8
commit bf5aafe455
7 changed files with 54 additions and 167 deletions

CPEAnalyzer.java

@@ -27,6 +27,7 @@ import java.util.Set;
import java.util.StringTokenizer;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.builder.CompareToBuilder;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryparser.classic.ParseException;
@@ -39,6 +40,7 @@ import org.owasp.dependencycheck.data.cpe.Fields;
import org.owasp.dependencycheck.data.cpe.IndexEntry;
import org.owasp.dependencycheck.data.cpe.IndexException;
import org.owasp.dependencycheck.data.lucene.LuceneUtils;
import org.owasp.dependencycheck.data.lucene.SearchFieldAnalyzer;
import org.owasp.dependencycheck.data.nvdcve.CveDB;
import org.owasp.dependencycheck.data.nvdcve.DatabaseException;
import org.owasp.dependencycheck.dependency.Confidence;
@@ -212,7 +214,6 @@ public class CPEAnalyzer extends AbstractAnalyzer {
* @throws ParseException is thrown when the Lucene query cannot be parsed.
*/
protected void determineCPE(Dependency dependency) throws CorruptIndexException, IOException, ParseException {
//TODO test dojo-war against this. we should get dojo-toolkit:dojo-toolkit AND dojo-toolkit:toolkit
String vendors = "";
String products = "";
for (Confidence confidence : Confidence.values()) {
@@ -488,7 +489,11 @@ public class CPEAnalyzer extends AbstractAnalyzer {
final String[] words = text.split("[\\s_-]");
final List<String> list = new ArrayList<>();
String tempWord = null;
CharArraySet stopWords = SearchFieldAnalyzer.getStopWords();
for (String word : words) {
if (stopWords.contains(word)) {
continue;
}
/*
single letter words should be concatenated with the next word.
so { "m", "core", "sample" } -> { "mcore", "sample" }
@@ -561,6 +566,9 @@ public class CPEAnalyzer extends AbstractAnalyzer {
protected boolean determineIdentifiers(Dependency dependency, String vendor, String product,
Confidence currentConfidence) throws UnsupportedEncodingException {
final Set<VulnerableSoftware> cpes = cve.getCPEs(vendor, product);
if (cpes.isEmpty()) {
return false;
}
DependencyVersion bestGuess = new DependencyVersion("-");
Confidence bestGuessConf = null;
boolean hasBroadMatch = false;
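
The new stop-word check above short-circuits the cleanup loop before the single-letter concatenation logic runs. Below is a minimal, self-contained sketch of the combined behavior; the stop-word set is stubbed with a plain HashSet in place of SearchFieldAnalyzer.getStopWords(), and the concatenation branch, which the hunk truncates, is reconstructed from the comment's example, so treat those details as assumptions.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class TokenCleanupSketch {
    public static void main(String[] args) {
        // stand-in for SearchFieldAnalyzer.getStopWords()
        final Set<String> stopWords = new HashSet<>(Arrays.asList("framework", "software", "inc"));
        final String[] words = "m core sample framework".split("[\\s_-]");
        final List<String> list = new ArrayList<>();
        String tempWord = null;
        for (String word : words) {
            if (stopWords.contains(word)) {
                continue; // stop words are skipped before any concatenation
            }
            if (tempWord != null) {
                list.add(tempWord + word); // glue the held single-letter word onto this one
                tempWord = null;
            } else if (word.length() <= 1) {
                tempWord = word; // hold single-letter words for the next iteration
            } else {
                list.add(word);
            }
        }
        if (tempWord != null) {
            list.add(tempWord); // a trailing single-letter word is kept as-is
        }
        System.out.println(list); // prints [mcore, sample]
    }
}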

CpeMemoryIndex.java

@@ -52,7 +52,7 @@ import org.slf4j.LoggerFactory;
*
* @author Jeremy Long
*/
public final class CpeMemoryIndex {
public final class CpeMemoryIndex implements AutoCloseable {
/**
* The logger.
@@ -160,6 +160,7 @@ public final class CpeMemoryIndex {
/**
* Closes the CPE Index.
*/
@Override
public synchronized void close() {
if (searchingAnalyzer != null) {
searchingAnalyzer.close();
@@ -206,7 +207,6 @@ public final class CpeMemoryIndex {
v.setStringValue(pair.getLeft());
p.setStringValue(pair.getRight());
indexWriter.addDocument(doc);
resetFieldAnalyzer();
}
}
indexWriter.commit();
@@ -221,18 +221,6 @@ public final class CpeMemoryIndex {
}
}
/**
* Resets the product and vendor field analyzers.
*/
private void resetFieldAnalyzer() {
if (productFieldAnalyzer != null) {
productFieldAnalyzer.clear();
}
if (vendorFieldAnalyzer != null) {
vendorFieldAnalyzer.clear();
}
}
/**
* Searches the index using the given search string.
*
@@ -248,7 +236,6 @@ public final class CpeMemoryIndex {
throw new ParseException("Query is null or empty");
}
LOGGER.debug(searchString);
resetFieldAnalyzer();
final Query query = queryParser.parse(searchString);
return search(query, maxQueryResults);
}
@@ -263,7 +250,6 @@ public final class CpeMemoryIndex {
* @throws IOException thrown if there is an IOException
*/
public synchronized TopDocs search(Query query, int maxQueryResults) throws CorruptIndexException, IOException {
resetFieldAnalyzer();
return indexSearcher.search(query, maxQueryResults);
}

FieldAnalyzer.java

@@ -1,84 +0,0 @@
/*
* This file is part of dependency-check-core.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright (c) 2012 Jeremy Long. All Rights Reserved.
*/
package org.owasp.dependencycheck.data.lucene;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.util.Version;
/**
* <p>
* A Lucene Analyzer that utilizes the WhitespaceTokenizer, WordDelimiterFilter,
* LowerCaseFilter, and StopFilter. The intended purpose of this Analyzer is to
* index the CPE fields vendor and product.</p>
*
* @author Jeremy Long
* @deprecated the field analyzer should not be used, instead use the
* SearchFieldAnalyzer so that the token analyzing filter is used.
*/
@Deprecated
public class FieldAnalyzer extends Analyzer {
/**
* The Lucene Version used.
*/
private final Version version;
/**
* Creates a new FieldAnalyzer.
*
* @param version the Lucene version
*/
public FieldAnalyzer(Version version) {
this.version = version;
}
/**
* Creates the TokenStreamComponents
*
* @param fieldName the field name being analyzed
* @param reader the reader containing the input
* @return the TokenStreamComponents
*/
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer source = new AlphaNumericTokenizer(version, reader);
TokenStream stream = source;
stream = new WordDelimiterFilter(stream,
WordDelimiterFilter.CATENATE_WORDS
| WordDelimiterFilter.GENERATE_WORD_PARTS
| WordDelimiterFilter.GENERATE_NUMBER_PARTS
| WordDelimiterFilter.PRESERVE_ORIGINAL
| WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
| WordDelimiterFilter.SPLIT_ON_NUMERICS
| WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null);
stream = new LowerCaseFilter(version, stream);
stream = new StopFilter(version, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
return new TokenStreamComponents(source, stream);
}
}

SearchFieldAnalyzer.java

@@ -18,6 +18,8 @@
package org.owasp.dependencycheck.data.lucene;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -25,6 +27,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -39,9 +42,25 @@ public class SearchFieldAnalyzer extends Analyzer {
*/
private final Version version;
/**
* A local reference to the TokenPairConcatenatingFilter so that we can clear any left over state if this analyzer is re-used.
* The list of additional stop words to use.
*/
private TokenPairConcatenatingFilter concatenatingFilter;
private static final List<String> ADDITIONAL_STOP_WORDS = Arrays.asList("software", "framework", "inc",
"com", "org", "net", "www", "consulting", "ltd", "foundation", "project");
/**
* The set of stop words to use in the analyzer.
*/
private final CharArraySet stopWords;
/**
* Returns the set of stop words being used.
*
* @return the set of stop words being used
*/
public static CharArraySet getStopWords() {
CharArraySet words = new CharArraySet(LuceneUtils.CURRENT_VERSION, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
words.addAll(ADDITIONAL_STOP_WORDS);
return words;
}
/**
* Constructs a new SearchFieldAnalyzer.
@@ -50,6 +69,7 @@ public class SearchFieldAnalyzer extends Analyzer {
*/
public SearchFieldAnalyzer(Version version) {
this.version = version;
stopWords = getStopWords();
}
/**
@@ -75,22 +95,9 @@ public class SearchFieldAnalyzer extends Analyzer {
stream = new LowerCaseFilter(version, stream);
stream = new UrlTokenizingFilter(stream);
concatenatingFilter = new TokenPairConcatenatingFilter(stream);
stream = concatenatingFilter;
stream = new StopFilter(version, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
stream = new StopFilter(version, stream, stopWords);
stream = new TokenPairConcatenatingFilter(stream);
return new TokenStreamComponents(source, stream);
}
/**
* <p>
* Resets the analyzer and clears any internal state data that may have been left-over from previous uses of the analyzer.</p>
* <p>
* <b>If this analyzer is re-used this method must be called between uses.</b></p>
*/
public void clear() {
if (concatenatingFilter != null) {
concatenatingFilter.clear();
}
}
}
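
The key change above is ordering: the StopFilter now runs before the TokenPairConcatenatingFilter and uses the extended stop list, so filler terms like "software" or "foundation" can never leak into concatenated pairs; and since a fresh TokenPairConcatenatingFilter is built per token stream, the old clear() bookkeeping goes away. A minimal sketch of the effect; the expected output (just "apache") is an inference from the filter chain, not from the commit:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.owasp.dependencycheck.data.lucene.LuceneUtils;
import org.owasp.dependencycheck.data.lucene.SearchFieldAnalyzer;

public class StopWordSketch {
    public static void main(String[] args) throws IOException {
        final Analyzer analyzer = new SearchFieldAnalyzer(LuceneUtils.CURRENT_VERSION);
        try (TokenStream ts = analyzer.tokenStream("product", new StringReader("Apache Software Foundation"))) {
            final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // "software" and "foundation" are on the new stop list and are
                // removed before pair concatenation, so only "apache" is emitted
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}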

TokenPairConcatenatingFilter.java

@@ -25,9 +25,11 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* <p>
* Takes a TokenStream and adds additional tokens by concatenating pairs of words.</p>
* Takes a TokenStream and adds additional tokens by concatenating pairs of
* words.</p>
* <p>
* <b>Example:</b> "Spring Framework Core" -&gt; "Spring SpringFramework Framework FrameworkCore Core".</p>
* <b>Example:</b> "Spring Framework Core" -&gt; "Spring SpringFramework
* Framework FrameworkCore Core".</p>
*
* @author Jeremy Long
*/
@@ -46,24 +48,6 @@ public final class TokenPairConcatenatingFilter extends TokenFilter {
*/
private final LinkedList<String> words;
/**
* Returns the previous word. This is needed in the test cases.
*
* @return the previous word
*/
protected String getPreviousWord() {
return previousWord;
}
/**
* Returns the words list. This is needed in the test cases.
*
* @return the words list
*/
protected LinkedList<String> getWords() {
return words;
}
/**
* Constructs a new TokenPairConcatenatingFilter.
*
@@ -75,8 +59,9 @@ public final class TokenPairConcatenatingFilter extends TokenFilter {
}
/**
* Increments the underlying TokenStream and sets CharTermAttributes to construct an expanded set of tokens by concatenating
* tokens with the previous token.
* Increments the underlying TokenStream and sets CharTermAttributes to
* construct an expanded set of tokens by concatenating tokens with the
* previous token.
*
* @return whether or not we have hit the end of the TokenStream
* @throws IOException is thrown when an IOException occurs
@@ -112,11 +97,18 @@ public final class TokenPairConcatenatingFilter extends TokenFilter {
/**
* <p>
* Resets the Filter and clears any internal state data that may have been left-over from previous uses of the Filter.</p>
* Resets the Filter and clears any internal state data that may have been
* left-over from previous uses of the Filter.</p>
* <p>
* <b>If this Filter is re-used this method must be called between uses.</b></p>
* <b>If this Filter is re-used this method must be called between
* uses.</b></p>
*
* @throws java.io.IOException thrown if there is an error resetting the
* filter
*/
public void clear() {
@Override
public void end() throws IOException {
super.end();
previousWord = null;
words.clear();
}
@@ -158,5 +150,4 @@ public final class TokenPairConcatenatingFilter extends TokenFilter {
}
return !(this.words != other.words && (this.words == null || !this.words.equals(other.words)));
}
}
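
Replacing the ad hoc clear() method with an end() override moves the state reset into Lucene's standard TokenStream lifecycle (reset, incrementToken, end, close), so nothing outside the filter has to remember an extra call between uses. A sketch of the documented pair expansion under that lifecycle; the printed tokens come from the class javadoc example:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.owasp.dependencycheck.data.lucene.LuceneUtils;
import org.owasp.dependencycheck.data.lucene.TokenPairConcatenatingFilter;

public class PairExpansionSketch {
    public static void main(String[] args) throws Exception {
        final TokenStream source = new WhitespaceTokenizer(LuceneUtils.CURRENT_VERSION,
                new StringReader("Spring Framework Core"));
        try (TokenStream filter = new TokenPairConcatenatingFilter(source)) {
            final CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
            filter.reset();
            while (filter.incrementToken()) {
                System.out.print(term + " "); // Spring SpringFramework Framework FrameworkCore Core
            }
            filter.end(); // now also clears previousWord and the word queue
        }
    }
}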

FieldAnalyzerTest.java

@@ -52,7 +52,7 @@ public class FieldAnalyzerTest extends BaseTest {
@Test
public void testAnalyzers() throws Exception {
Analyzer analyzer = new FieldAnalyzer(LuceneUtils.CURRENT_VERSION);
Analyzer analyzer = new SearchFieldAnalyzer(LuceneUtils.CURRENT_VERSION);
Directory index = new RAMDirectory();
String field1 = "product";
@@ -93,12 +93,10 @@ public class FieldAnalyzerTest extends BaseTest {
assertEquals("springframework", searcher.doc(hits[0].doc).get(field1));
assertEquals("springsource", searcher.doc(hits[0].doc).get(field2));
searchAnalyzerProduct.clear(); //ensure we don't have anything left over from the previous search.
searchAnalyzerVendor.clear();
querystr = "product:(Apache Struts) vendor:(Apache)";
Query q2 = parser.parse(querystr);
assertFalse("second parsing contains previousWord from the TokenPairConcatenatingFilter", q2.toString().contains("core"));
querystr = "product:( x-stream^5 ) AND vendor:( thoughtworks.xstream )";
Query q3 = parser.parse(querystr);
collector = TopScoreDocCollector.create(hitsPerPage, true);

TokenPairConcatenatingFilterTest.java

@@ -26,7 +26,6 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
/**
*
@@ -55,22 +54,4 @@ public class TokenPairConcatenatingFilterTest extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter,
new String[]{"one", "onetwo", "two", "twothree", "three"});
}
/**
* Test of clear method, of class TokenPairConcatenatingFilter.
*
* @throws java.io.IOException
*/
@Test
public void testClear() throws IOException {
TokenStream ts = new WhitespaceTokenizer(LuceneUtils.CURRENT_VERSION, new StringReader("one two three"));
TokenPairConcatenatingFilter filter = new TokenPairConcatenatingFilter(ts);
assertTokenStreamContents(filter, new String[]{"one", "onetwo", "two", "twothree", "three"});
assertNotNull(filter.getPreviousWord());
filter.clear();
assertNull(filter.getPreviousWord());
assertTrue(filter.getWords().isEmpty());
}
}