1 /*
2 * This file is part of dependency-check-core.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 * Copyright (c) 2013 Jeremy Long. All Rights Reserved.
17 */
18 package org.owasp.dependencycheck.utils;
19
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
28
29 /**
30 *
31 * @author Jeremy Long
32 */
33 public final class UrlStringUtils {
34
35 /**
36 * Private constructor for a utility class.
37 */
38 private UrlStringUtils() {
39 }
40 /**
41 * A regular expression to test if a string contains a URL.
42 */
43 private static final Pattern CONTAINS_URL_TEST = Pattern.compile("^.*(ht|f)tps?://.*$", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
44 /**
45 * A regular expression to test if a string is a URL.
46 */
47 private static final Pattern IS_URL_TEST = Pattern.compile("^(ht|f)tps?://.*", Pattern.CASE_INSENSITIVE);
48
49 /**
50 * Tests if the text provided contains a URL. This is somewhat limited search in that it only looks for
51 * (ftp|http|https)://
52 *
53 * @param text the text to search
54 * @return true if the text contains a url, otherwise false
55 */
56 public static boolean containsUrl(String text) {
57 return CONTAINS_URL_TEST.matcher(text).matches();
58 }
59
60 /**
61 * Tests if the given text is url.
62 *
63 * @param text the string to test
64 * @return returns true if the text is a url, otherwise false
65 */
66 public static boolean isUrl(String text) {
67 return IS_URL_TEST.matcher(text).matches();
68 }
69 /**
70 * A listing of domain parts that should not be used as evidence. Yes, this is an incomplete list.
71 */
72 private static final Set<String> IGNORE_LIST = new HashSet<String>(
73 Arrays.asList("www", "com", "org", "gov", "info", "name", "net", "pro", "tel", "mobi", "xxx"));
74
75 /**
76 * <p>
77 * Takes a URL, in String format, and adds the important parts of the URL to a list of strings.</p>
78 * <p>
79 * Example, given the following input:</p>
80 * <code>"https://www.somedomain.com/path1/path2/file.php?id=439"</code>
81 * <p>
82 * The function would return:</p>
83 * <code>{"some.domain", "path1", "path2", "file"}</code>
84 *
85 * @param text a URL
86 * @return importantParts a list of the important parts of the URL
87 * @throws MalformedURLException thrown if the URL is malformed
88 */
89 public static List<String> extractImportantUrlData(String text) throws MalformedURLException {
90 final List<String> importantParts = new ArrayList<String>();
91 final URL url = new URL(text);
92 final String[] domain = url.getHost().split("\\.");
93 //add the domain except www and the tld.
94 for (int i = 0; i < domain.length - 1; i++) {
95 final String sub = domain[i];
96 if (!IGNORE_LIST.contains(sub.toLowerCase())) {
97 importantParts.add(sub);
98 }
99 }
100 final String document = url.getPath();
101 final String[] pathParts = document.split("[\\//]");
102 for (int i = 0; i < pathParts.length - 2; i++) {
103 if (!pathParts[i].isEmpty()) {
104 importantParts.add(pathParts[i]);
105 }
106 }
107 if (pathParts.length > 0 && !pathParts[pathParts.length - 1].isEmpty()) {
108 final String fileNameNoExt = pathParts[pathParts.length - 1].replaceAll("\\..*{0,5}$", "");
109 importantParts.add(fileNameNoExt);
110 }
111 return importantParts;
112 }
113 }