View Javadoc
1   /*
2    * This file is part of dependency-check-core.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   * Copyright (c) 2013 Jeremy Long. All Rights Reserved.
17   */
18  package org.owasp.dependencycheck.utils;
19  
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
28  
/**
 * Static helpers for spotting URLs inside free text and for breaking a URL
 * down into the tokens (host labels, path segments, file name) that are
 * useful as evidence. Only the schemes matched by {@code (ht|f)tps?://}
 * (http, https, ftp, ftps) are recognised.
 *
 * @author Jeremy Long
 */
public final class UrlStringUtils {

    /**
     * Matches a supported URL scheme prefix anywhere in the text. Used with
     * {@code find()} so that multi-line text is searched as well.
     */
    private static final Pattern CONTAINS_URL_TEST = Pattern.compile("(ht|f)tps?://", Pattern.CASE_INSENSITIVE);
    /**
     * A regular expression to test if a string is a URL, i.e. the whole
     * string starts with a supported scheme.
     */
    private static final Pattern IS_URL_TEST = Pattern.compile("^(ht|f)tps?://.*", Pattern.CASE_INSENSITIVE);
    /**
     * A listing of domain parts that should not be used as evidence. Yes, this is an incomplete list.
     * Entries are lower case; lookups must lower-case the candidate first.
     */
    private static final Set<String> IGNORE_LIST = Collections.unmodifiableSet(new HashSet<String>(
            Arrays.asList("www", "com", "org", "gov", "info", "name", "net", "pro", "tel", "mobi", "xxx")));

    /**
     * Private constructor for a utility class.
     */
    private UrlStringUtils() {
    }

    /**
     * Tests if the text provided contains a URL. This is somewhat limited search in that it only looks for
     * (ftp|ftps|http|https)://, case-insensitively, anywhere in the text (including multi-line text).
     *
     * @param text the text to search; may be <code>null</code>
     * @return true if the text contains a url, otherwise false (also for <code>null</code>)
     */
    public static boolean containsUrl(String text) {
        // find() rather than matches(): '.' does not cross line breaks, so a
        // whole-input match would miss URLs in multi-line text.
        return text != null && CONTAINS_URL_TEST.matcher(text).find();
    }

    /**
     * Tests if the given text is url, i.e. it begins with a supported scheme.
     *
     * @param text the string to test; may be <code>null</code>
     * @return returns true if the text is a url, otherwise false (also for <code>null</code>)
     */
    public static boolean isUrl(String text) {
        return text != null && IS_URL_TEST.matcher(text).matches();
    }

    /**
     * <p>
     * Takes a URL, in String format, and adds the important parts of the URL to a list of strings.</p>
     * <p>
     * The important parts are: every host label except the last one (the TLD) and
     * except labels on the ignore list (such as "www"), every non-empty directory
     * segment of the path, and the final path segment truncated at its first '.'.
     * The query string and fragment are not used. Empty tokens are never returned.</p>
     * <p>
     * Example, given the following input:</p>
     * <code>"https://www.somedomain.com/path1/path2/file.php?id=439"</code>
     * <p>
     * The function would return:</p>
     * <code>{"somedomain", "path1", "path2", "file"}</code>
     *
     * @param text a URL
     * @return importantParts a list of the important parts of the URL
     * @throws MalformedURLException thrown if the URL is malformed
     */
    public static List<String> extractImportantUrlData(String text) throws MalformedURLException {
        final List<String> importantParts = new ArrayList<String>();
        final URL url = new URL(text);

        //add the domain except www and the tld.
        final String[] domain = url.getHost().split("\\.");
        for (int i = 0; i < domain.length - 1; i++) {
            final String sub = domain[i];
            // Locale.ROOT keeps the comparison stable regardless of the default locale.
            if (!sub.isEmpty() && !IGNORE_LIST.contains(sub.toLowerCase(Locale.ROOT))) {
                importantParts.add(sub);
            }
        }

        final String[] pathParts = url.getPath().split("/");
        // Every segment but the last is a directory; the last one is handled
        // separately below so its extension can be removed.
        for (int i = 0; i < pathParts.length - 1; i++) {
            if (!pathParts[i].isEmpty()) {
                importantParts.add(pathParts[i]);
            }
        }
        if (pathParts.length > 0) {
            final String fileName = pathParts[pathParts.length - 1];
            final int dot = fileName.indexOf('.');
            // Cut at the first dot so "file.tar.gz" yields "file".
            final String fileNameNoExt = dot < 0 ? fileName : fileName.substring(0, dot);
            // Dotfiles such as ".htaccess" leave nothing before the dot; skip them.
            if (!fileNameNoExt.isEmpty()) {
                importantParts.add(fileNameNoExt);
            }
        }
        return importantParts;
    }
}