/*
 * This file is part of Dependency-Check.
 *
 * Dependency-Check is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 *
 * Dependency-Check is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * Dependency-Check. If not, see http://www.gnu.org/licenses/.
 *
 * Copyright (c) 2013 Jeremy Long. All Rights Reserved.
 */

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Static helpers that test whether a string is (or contains) a URL and that
 * break a URL into the tokens that are useful as identifying evidence.
 *
 * @author Jeremy Long (jeremy.long@gmail.com)
 */
public final class UrlStringUtils {

    /**
     * A regular expression to test if a string contains a URL. It matches a
     * single line that contains an http, https or ftp scheme followed by
     * "://".
     */
    private static final Pattern CONTAINS_URL_TEST = Pattern.compile("^.*(ht|f)tps?://.*$", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
    /**
     * A regular expression to test if a string is a URL.
     */
    private static final Pattern IS_URL_TEST = Pattern.compile("^(ht|f)tps?://.*", Pattern.CASE_INSENSITIVE);
    /**
     * Separator used to split the path of a URL into its segments.
     */
    private static final Pattern PATH_SEPARATOR = Pattern.compile("/");
    /**
     * Matches everything from the first dot of a file name to its end, i.e.
     * the extension(s) of the file name.
     */
    private static final Pattern FILE_EXTENSION = Pattern.compile("\\..*$");

    /**
     * Private constructor for a utility class.
     */
    private UrlStringUtils() {
    }

    /**
     * Tests if the text provided contains a URL. This is a somewhat limited
     * search in that it only looks for (ftp|http|https)://. The text may span
     * several lines; it is enough for one of them to contain a URL.
     *
     * @param text the text to search, may be null
     * @return true if the text contains a url, otherwise false (including
     * when the text is null)
     */
    public static boolean containsUrl(String text) {
        // find() rather than matches(): '.' never crosses a line terminator,
        // so a whole-input match would reject every multi-line text.
        return text != null && CONTAINS_URL_TEST.matcher(text).find();
    }

    /**
     * Tests if the given text is a URL, i.e. starts with (ftp|http|https)://.
     *
     * @param text the string to test, may be null
     * @return true if the text is a url, otherwise false (including when the
     * text is null)
     */
    public static boolean isUrl(String text) {
        return text != null && IS_URL_TEST.matcher(text).matches();
    }

    /**
     * <p>Takes a URL, in String format, and adds the important parts of the
     * URL to a list of strings. The important parts are the labels of the
     * host name except "www" and the top level domain, every non-empty path
     * segment, and the last path segment with its extension removed. The
     * query string is ignored.</p>
     *
     * <p>Example, given the following input:</p>
     *
     * <code>"https://www.somedomain.com/path1/path2/file.php?id=439"</code>
     *
     * <p>The function would return:</p>
     *
     * <code>{"somedomain", "path1", "path2", "file"}</code>
     *
     * @param text a URL
     * @return importantParts a list of the important parts of the URL; never
     * null and never containing empty strings
     * @throws MalformedURLException thrown if the URL is malformed
     */
    public static List<String> extractImportantUrlData(String text) throws MalformedURLException {
        final List<String> importantParts = new ArrayList<String>();
        final URL url = new URL(text);

        //add the domain except www and the tld (the last label).
        final String[] domain = url.getHost().split("\\.");
        for (int i = 0; i < domain.length - 1; i++) {
            final String sub = domain[i];
            if (!sub.isEmpty() && !"www".equalsIgnoreCase(sub)) {
                importantParts.add(sub);
            }
        }

        // split() drops trailing empty strings, so a path of "/" yields an
        // empty array and "last" becomes -1; both steps below then do nothing.
        final String[] pathParts = PATH_SEPARATOR.split(url.getPath());
        final int last = pathParts.length - 1;

        //every segment before the last one is taken as is.
        for (int i = 0; i < last; i++) {
            if (!pathParts[i].isEmpty()) {
                importantParts.add(pathParts[i]);
            }
        }

        //the last segment is treated as a file name and loses its extension.
        if (last >= 0) {
            final String fileNameNoExt = FILE_EXTENSION.matcher(pathParts[last]).replaceAll("");
            if (!fileNameNoExt.isEmpty()) {
                importantParts.add(fileNameNoExt);
            }
        }
        return importantParts;
    }
}