Added methods to test strings for urls and to extract important portions of the url

Former-commit-id: be5878b61d1ce226a4d802b431cfa6c0d32252d1
2026-03-27 19:41:38 +01:00 · 2013-06-23 22:38:45 -04:00
parent f3c026f278
commit 6016370515
1 changed files with 104 additions and 0 deletions
--- a/src/main/java/org/owasp/dependencycheck/utils/UrlStringUtils.java
+++ b/src/main/java/org/owasp/dependencycheck/utils/UrlStringUtils.java
@@ -0,0 +1,104 @@
 /*
 * This file is part of Dependency-Check.
 *
 * Dependency-Check is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 *
 * Dependency-Check is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * Dependency-Check. If not, see http://www.gnu.org/licenses/.
 *
 * Copyright (c) 2013 Jeremy Long. All Rights Reserved.
 */
 package org.owasp.dependencycheck.utils;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Pattern;
 /**
 *
 * @author Jeremy Long (jeremy.long@gmail.com)
 */
 public final class UrlStringUtils {
    /**
     * Private constructor for a utility class.
     */
    private UrlStringUtils() {
    }
    /**
     * A regular expression to test if a string contains a URL.
     */
    private static final Pattern CONTAINS_URL_TEST = Pattern.compile("^.*(ht|f)tps?://.*$", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
    /**
     * A regular expression to test if a string is a URL.
     */
    private static final Pattern IS_URL_TEST = Pattern.compile("^(ht|f)tps?://.*", Pattern.CASE_INSENSITIVE);
    /**
     * Tests if the text provided contains a URL. This is somewhat limited
     * search in that it only looks for (ftp|http|https)://
     *
     * @param text the text to search
     * @return true if the text contains a url, otherwise false
     */
    public static boolean containsUrl(String text) {
        return CONTAINS_URL_TEST.matcher(text).matches();
    }
    /**
     * Tests if the given text is url.
     *
     * @param text the string to test
     * @return returns true if the text is a url, otherwise false
     */
    public static boolean isUrl(String text) {
        return IS_URL_TEST.matcher(text).matches();
    }
    /**
     * <p>Takes a URL, in String format, and adds the important parts of the URL
     * to a list of strings.</p>
     * <p>Example, given the following input:</p>
     * <code>"https://www.somedomain.com/path1/path2/file.php?id=439"</code>
     * <p>The function would return:</p>
     * <code>{"somedomain", "path1", "path2", "file"}</code>
     *
     * @param text a URL
     * @return importantParts a list of the important parts of the URL
     * @throws MalformedURLException thrown if the URL is malformed
     */
    public static List<String> extractImportantUrlData(String text) throws MalformedURLException {
        final ArrayList<String> importantParts = new ArrayList<String>();
        final URL url = new URL(text);
        final String[] domain = url.getHost().split("\\.");
        //add the domain except www and the tld.
        for (int i = 0; i < domain.length - 1; i++) {
            final String sub = domain[i];
            if (!"www".equalsIgnoreCase(sub)) {
                importantParts.add(sub);
            }
        }
        final String document = url.getPath();
        final String[] pathParts = document.split("[\\//]");
        for (int i = 0; i < pathParts.length - 2; i++) {
            if (!pathParts[i].isEmpty()) {
                importantParts.add(pathParts[i]);
            }
        }
        if (!pathParts[pathParts.length - 1].isEmpty()) {
            final String fileNameNoExt = pathParts[pathParts.length - 1].replaceAll("\\..*{0,5}$", "");
            importantParts.add(fileNameNoExt);
        }
        return importantParts;
    }
 }