From c23febbcf05ae1fb0eb83adb882d387b0bdfc514 Mon Sep 17 00:00:00 2001 From: Jeremy Long Date: Sun, 23 Jun 2013 22:29:54 -0400 Subject: [PATCH] added UrlTokenizingFilter Former-commit-id: 6868a5b16e8d44f8761028278b6c292f98f53a7b --- .../data/lucene/UrlTokenizingFilter.java | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 src/main/java/org/owasp/dependencycheck/data/lucene/UrlTokenizingFilter.java diff --git a/src/main/java/org/owasp/dependencycheck/data/lucene/UrlTokenizingFilter.java b/src/main/java/org/owasp/dependencycheck/data/lucene/UrlTokenizingFilter.java new file mode 100644 index 000000000..f3f8e2dd6 --- /dev/null +++ b/src/main/java/org/owasp/dependencycheck/data/lucene/UrlTokenizingFilter.java @@ -0,0 +1,85 @@ +/* + * This file is part of Dependency-Check. + * + * Dependency-Check is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * Dependency-Check is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * Dependency-Check. If not, see http://www.gnu.org/licenses/. + * + * Copyright (c) 2013 Jeremy Long. All Rights Reserved. + */ +package org.owasp.dependencycheck.data.lucene; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.LinkedList; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.owasp.dependencycheck.utils.UrlStringUtils; + +/** + *

Takes a TokenStream, looks for tokens that contain URLs, and replaces each URL with the tokens that UrlStringUtils extracts from it.

+ *

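Example: in "see http://www.example.com/lib" the word "see" is kept as a token and the URL is replaced by whatever UrlStringUtils.extractImportantUrlData returns for it.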

+ * + * @author Jeremy Long (jeremy.long@owasp.org) + */ +public final class UrlTokenizingFilter extends AbstractTokenizingFilter { + + /** + * Constructs a new VersionTokenizingFilter. + * + * @param stream the TokenStream that this filter will process + */ + public UrlTokenizingFilter(TokenStream stream) { + super(stream); + } + + /** + * Increments the underlying TokenStream and sets CharTermAttributes to + * construct an expanded set of tokens by concatenating tokens with the + * previous token. + * + * @return whether or not we have hit the end of the TokenStream + * @throws IOException is thrown when an IOException occurs + */ + @Override + public boolean incrementToken() throws IOException { + final LinkedList tokens = getTokens(); + final CharTermAttribute termAtt = getTermAtt(); + if (tokens.size() == 0 && input.incrementToken()) { + final String text = new String(termAtt.buffer(), 0, termAtt.length()); + if (UrlStringUtils.containsUrl(text)) { + final String[] parts = text.split("\\s"); + for (String part : parts) { + if (UrlStringUtils.isUrl(part)) { + try { + final List data = UrlStringUtils.extractImportantUrlData(part); + tokens.addAll(data); + } catch (MalformedURLException ex) { + Logger.getLogger(UrlTokenizingFilter.class.getName()).log(Level.INFO, "error parsing " + part, ex); + tokens.add(part); + } + } else { + tokens.add(part); + } + } + } else { + tokens.add(text); + } + } + return addTerm(); + } +}