From ec448438e525b7256808106a8248d759972a6f5b Mon Sep 17 00:00:00 2001 From: Jeremy Long Date: Mon, 29 May 2017 08:19:22 -0400 Subject: [PATCH] fix for issue #710 --- .../owasp/dependencycheck/xml/XmlEntity.java | 309 ++++++++++++++++++ .../dependencycheck/xml/XmlInputStream.java | 269 +++++++++++++++ .../dependencycheck/xml/pom/PomParser.java | 4 +- .../dependencycheck/xml/pom/PomUtilsTest.java | 2 +- .../src/test/resources/jmockit-1.26.pom | 2 +- .../it/710-pom-parse-error/postbuild.groovy | 17 + src/main/config/checkstyle-suppressions.xml | 2 + 7 files changed, 602 insertions(+), 3 deletions(-) create mode 100644 dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/XmlEntity.java create mode 100644 dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/XmlInputStream.java create mode 100644 dependency-check-maven/src/it/710-pom-parse-error/postbuild.groovy diff --git a/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/XmlEntity.java b/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/XmlEntity.java new file mode 100644 index 000000000..2da3aacdf --- /dev/null +++ b/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/XmlEntity.java @@ -0,0 +1,309 @@ +package org.owasp.dependencycheck.xml; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * This is a utility class to convert named XML Entities (such as ø) into + * its HTML encoded Unicode code point (i.e. &#248;). This is a slightly + * modified (class/method rename) from an SO answer: + * https://stackoverflow.com/questions/7286428/help-the-java-sax-parser-to-understand-bad-xml + * + * @author https://stackoverflow.com/users/823393/oldcurmudgeon + */ +public class XmlEntity { + + /** + * Private constructor for a utility class. + */ + private XmlEntity() { + } + + /** + * Converts a named XML entity into its HTML encoded Unicode code point. 
+ * + * @param s the named entity (note, this should not include the leading '&' + * or trailing ';' + * @return the HTML encoded Unicode code point representation of the named + * entity + */ + public static String fromNamedReference(CharSequence s) { + if (s == null) { + return null; + } + final Integer code = SPECIALS.get(s.toString()); + if (code != null) { + return "&#" + code + ";"; + } + return null; + } + + /** + * The map of HTML entities. + */ + private static final Map SPECIALS; + + /** + * Create a map HTML Named Entities to their numeric equivalent. Derived + * from Wikipedia + * http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references + */ + static { + final Map map = new HashMap<>(); + map.put("quot", 34); + map.put("amp", 38); + map.put("apos", 39); + map.put("lt", 60); + map.put("gt", 62); + map.put("nbsp", 160); + map.put("iexcl", 161); + map.put("cent", 162); + map.put("pound", 163); + map.put("curren", 164); + map.put("yen", 165); + map.put("brvbar", 166); + map.put("sect", 167); + map.put("uml", 168); + map.put("copy", 169); + map.put("ordf", 170); + map.put("laquo", 171); + map.put("not", 172); + map.put("shy", 173); + map.put("reg", 174); + map.put("macr", 175); + map.put("deg", 176); + map.put("plusmn", 177); + map.put("sup2", 178); + map.put("sup3", 179); + map.put("acute", 180); + map.put("micro", 181); + map.put("para", 182); + map.put("middot", 183); + map.put("cedil", 184); + map.put("sup1", 185); + map.put("ordm", 186); + map.put("raquo", 187); + map.put("frac14", 188); + map.put("frac12", 189); + map.put("frac34", 190); + map.put("iquest", 191); + map.put("Agrave", 192); + map.put("Aacute", 193); + map.put("Acirc", 194); + map.put("Atilde", 195); + map.put("Auml", 196); + map.put("Aring", 197); + map.put("AElig", 198); + map.put("Ccedil", 199); + map.put("Egrave", 200); + map.put("Eacute", 201); + map.put("Ecirc", 202); + map.put("Euml", 203); + map.put("Igrave", 204); + map.put("Iacute", 205); + map.put("Icirc", 
206); + map.put("Iuml", 207); + map.put("ETH", 208); + map.put("Ntilde", 209); + map.put("Ograve", 210); + map.put("Oacute", 211); + map.put("Ocirc", 212); + map.put("Otilde", 213); + map.put("Ouml", 214); + map.put("times", 215); + map.put("Oslash", 216); + map.put("Ugrave", 217); + map.put("Uacute", 218); + map.put("Ucirc", 219); + map.put("Uuml", 220); + map.put("Yacute", 221); + map.put("THORN", 222); + map.put("szlig", 223); + map.put("agrave", 224); + map.put("aacute", 225); + map.put("acirc", 226); + map.put("atilde", 227); + map.put("auml", 228); + map.put("aring", 229); + map.put("aelig", 230); + map.put("ccedil", 231); + map.put("egrave", 232); + map.put("eacute", 233); + map.put("ecirc", 234); + map.put("euml", 235); + map.put("igrave", 236); + map.put("iacute", 237); + map.put("icirc", 238); + map.put("iuml", 239); + map.put("eth", 240); + map.put("ntilde", 241); + map.put("ograve", 242); + map.put("oacute", 243); + map.put("ocirc", 244); + map.put("otilde", 245); + map.put("ouml", 246); + map.put("divide", 247); + map.put("oslash", 248); + map.put("ugrave", 249); + map.put("uacute", 250); + map.put("ucirc", 251); + map.put("uuml", 252); + map.put("yacute", 253); + map.put("thorn", 254); + map.put("yuml", 255); + map.put("OElig", 338); + map.put("oelig", 339); + map.put("Scaron", 352); + map.put("scaron", 353); + map.put("Yuml", 376); + map.put("fnof", 402); + map.put("circ", 710); + map.put("tilde", 732); + map.put("Alpha", 913); + map.put("Beta", 914); + map.put("Gamma", 915); + map.put("Delta", 916); + map.put("Epsilon", 917); + map.put("Zeta", 918); + map.put("Eta", 919); + map.put("Theta", 920); + map.put("Iota", 921); + map.put("Kappa", 922); + map.put("Lambda", 923); + map.put("Mu", 924); + map.put("Nu", 925); + map.put("Xi", 926); + map.put("Omicron", 927); + map.put("Pi", 928); + map.put("Rho", 929); + map.put("Sigma", 931); + map.put("Tau", 932); + map.put("Upsilon", 933); + map.put("Phi", 934); + map.put("Chi", 935); + map.put("Psi", 936); + 
map.put("Omega", 937); + map.put("alpha", 945); + map.put("beta", 946); + map.put("gamma", 947); + map.put("delta", 948); + map.put("epsilon", 949); + map.put("zeta", 950); + map.put("eta", 951); + map.put("theta", 952); + map.put("iota", 953); + map.put("kappa", 954); + map.put("lambda", 955); + map.put("mu", 956); + map.put("nu", 957); + map.put("xi", 958); + map.put("omicron", 959); + map.put("pi", 960); + map.put("rho", 961); + map.put("sigmaf", 962); + map.put("sigma", 963); + map.put("tau", 964); + map.put("upsilon", 965); + map.put("phi", 966); + map.put("chi", 967); + map.put("psi", 968); + map.put("omega", 969); + map.put("thetasym", 977); + map.put("upsih", 978); + map.put("piv", 982); + map.put("ensp", 8194); + map.put("emsp", 8195); + map.put("thinsp", 8201); + map.put("zwnj", 8204); + map.put("zwj", 8205); + map.put("lrm", 8206); + map.put("rlm", 8207); + map.put("ndash", 8211); + map.put("mdash", 8212); + map.put("lsquo", 8216); + map.put("rsquo", 8217); + map.put("sbquo", 8218); + map.put("ldquo", 8220); + map.put("rdquo", 8221); + map.put("bdquo", 8222); + map.put("dagger", 8224); + map.put("Dagger", 8225); + map.put("bull", 8226); + map.put("hellip", 8230); + map.put("permil", 8240); + map.put("prime", 8242); + map.put("Prime", 8243); + map.put("lsaquo", 8249); + map.put("rsaquo", 8250); + map.put("oline", 8254); + map.put("frasl", 8260); + map.put("euro", 8364); + map.put("image", 8465); + map.put("weierp", 8472); + map.put("real", 8476); + map.put("trade", 8482); + map.put("alefsym", 8501); + map.put("larr", 8592); + map.put("uarr", 8593); + map.put("rarr", 8594); + map.put("darr", 8595); + map.put("harr", 8596); + map.put("crarr", 8629); + map.put("lArr", 8656); + map.put("uArr", 8657); + map.put("rArr", 8658); + map.put("dArr", 8659); + map.put("hArr", 8660); + map.put("forall", 8704); + map.put("part", 8706); + map.put("exist", 8707); + map.put("empty", 8709); + map.put("nabla", 8711); + map.put("isin", 8712); + map.put("notin", 8713); + 
map.put("ni", 8715); + map.put("prod", 8719); + map.put("sum", 8721); + map.put("minus", 8722); + map.put("lowast", 8727); + map.put("radic", 8730); + map.put("prop", 8733); + map.put("infin", 8734); + map.put("ang", 8736); + map.put("and", 8743); + map.put("or", 8744); + map.put("cap", 8745); + map.put("cup", 8746); + map.put("int", 8747); + map.put("there4", 8756); + map.put("sim", 8764); + map.put("cong", 8773); + map.put("asymp", 8776); + map.put("ne", 8800); + map.put("equiv", 8801); + map.put("le", 8804); + map.put("ge", 8805); + map.put("sub", 8834); + map.put("sup", 8835); + map.put("nsub", 8836); + map.put("sube", 8838); + map.put("supe", 8839); + map.put("oplus", 8853); + map.put("otimes", 8855); + map.put("perp", 8869); + map.put("sdot", 8901); + map.put("lceil", 8968); + map.put("rceil", 8969); + map.put("lfloor", 8970); + map.put("rfloor", 8971); + map.put("lang", 10216); + map.put("rang", 10217); + map.put("loz", 9674); + map.put("spades", 9824); + map.put("clubs", 9827); + map.put("hearts", 9829); + map.put("diams", 9830); + SPECIALS = Collections.unmodifiableMap(map); + } +} diff --git a/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/XmlInputStream.java b/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/XmlInputStream.java new file mode 100644 index 000000000..2879c2baf --- /dev/null +++ b/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/XmlInputStream.java @@ -0,0 +1,269 @@ +package org.owasp.dependencycheck.xml; + +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Cleans up often very bad XML. Primarily, this will convert named HTM entities + * into their HTM encoded Unicode code point representation. + * + *
+ * <ol>
+ * <li>Strips leading white space</li>
+ * <li>Recodes &amp;pound; etc to &amp;#...;</li>
+ * <li>Recodes lone &amp; as &amp;amp;</li>
+ * </ol>
+ * <p>
+ * This is a slightly modified (class/method rename) from an SO answer:
+ * https://stackoverflow.com/questions/7286428/help-the-java-sax-parser-to-understand-bad-xml</p>
+ * + * @author https://stackoverflow.com/users/823393/oldcurmudgeon + */ +public class XmlInputStream extends FilterInputStream { + + /** + * The logger. + */ + private static final Logger LOGGER = LoggerFactory.getLogger(XmlInputStream.class); + /** + * The minimum length of characters to read. + */ + private static final int MIN_LENGTH = 2; + /** + * Holder for everything we've read. + */ + private StringBuilder red = new StringBuilder(); + /** + * Data that needs to be pushed back. + */ + private StringBuilder pushBack = new StringBuilder(); + /** + * How much we've given them. + */ + private int given = 0; + /** + * How much we've read. + */ + private int pulled = 0; + + /** + * Constructs a new XML Input Stream. + * + * @param in the base input stream + */ + public XmlInputStream(InputStream in) { + super(in); + } + + /** + * NB: This is a Troll length (i.e. it goes 1, 2, many) so 2 actually means + * "at least 2" + * + * @return the length + */ + public int length() { + try { + final StringBuilder s = read(MIN_LENGTH); + pushBack.append(s); + return s.length(); + } catch (IOException ex) { + LOGGER.warn("Oops ", ex); + } + return 0; + } + + /** + * Read n characters. + * + * @param n the number of characters to read + * @return the characters read + * @throws IOException thrown when an error occurs + */ + private StringBuilder read(int n) throws IOException { + // Input stream finished? + boolean eof = false; + // Read that many. + final StringBuilder s = new StringBuilder(n); + while (s.length() < n && !eof) { + // Always get from the pushBack buffer. + if (pushBack.length() == 0) { + // Read something from the stream into pushBack. + eof = readIntoPushBack(); + } + + // Pushback only contains deliverable codes. + if (pushBack.length() > 0) { + // Grab one character + s.append(pushBack.charAt(0)); + // Remove it from pushBack + pushBack.deleteCharAt(0); + } + + } + return s; + } + + /** + * Might not actually push back anything but usually will. 
+ * + * @return true if at end-of-file + * @throws IOException thrown if there is an IO exception in the underlying + * steam + */ + private boolean readIntoPushBack() throws IOException { + // File finished? + boolean eof = false; + // Next char. + final int ch = in.read(); + if (ch >= 0) { + // Discard whitespace at start? + if (!(pulled == 0 && isWhiteSpace(ch))) { + // Good code. + pulled += 1; + // Parse out the &stuff; + if (ch == '&') { + // Process the & + readAmpersand(); + } else { + // Not an '&', just append. + pushBack.append((char) ch); + } + } + } else { + // Hit end of file. + eof = true; + } + return eof; + } + + /** + * Deal with an ampersand in the stream. + * + * @throws IOException thrown if an unknown entity is encountered + */ + private void readAmpersand() throws IOException { + // Read the whole word, up to and including the ; + final StringBuilder reference = new StringBuilder(); + int ch; + // Should end in a ';' + for (ch = in.read(); isAlphaNumeric(ch); ch = in.read()) { + reference.append((char) ch); + } + // Did we tidily finish? + if (ch == ';') { + // Yes! Translate it into a &#nnn; code. + final String code = XmlEntity.fromNamedReference(reference); + if (code != null) { + // Keep it. + pushBack.append(code); + } else { + throw new IOException("Invalid/Unknown reference '&" + reference + ";'"); + } + } else { + // Did not terminate properly! + // Perhaps an & on its own or a malformed reference. + // Either way, escape the & + pushBack.append("&").append(reference).append((char) ch); + } + } + + /** + * Keep track of what we've given them. + * + * @param s the sequence of characters given + * @param wanted the number of characters wanted + * @param got the number of characters given + */ + private void given(CharSequence s, int wanted, int got) { + red.append(s); + given += got; + LOGGER.trace("Given: [" + wanted + "," + got + "]-" + s); + } + + /** + * Reads the next byte. 
+ * + * @return the byte read + * @throws IOException thrown when there is an problem reading + */ + @Override + public int read() throws IOException { + final StringBuilder s = read(1); + given(s, 1, 1); + return s.length() > 0 ? s.charAt(0) : -1; + } + + /** + * Reads the next length of bytes from the stream into the given byte array + * at the given offset. + * + * @param data the buffer to store the data read + * @param offset the offset in the buffer to start writing + * @param length the length of data to read + * @return the number of bytes read + * @throws IOException thrown when there is an issue with the underlying + * stream + */ + @Override + public int read(byte[] data, int offset, int length) throws IOException { + final StringBuilder s = read(length); + int n = 0; + for (int i = 0; i < Math.min(length, s.length()); i++) { + data[offset + i] = (byte) s.charAt(i); + n += 1; + } + given(s, length, n); + return n > 0 ? n : -1; + } + + /** + * To string implementation. + * + * @return a string representation of the data given and read from the + * stream. + */ + @Override + public String toString() { + final String s = red.toString(); + final StringBuilder h = new StringBuilder(); + // Hex dump the small ones. + if (s.length() < 8) { + for (int i = 0; i < s.length(); i++) { + h.append(" ").append(Integer.toHexString(s.charAt(i))); + } + } + return "[" + given + "]-\"" + s + "\"" + (h.length() > 0 ? " (" + h.toString() + ")" : ""); + } + + /** + * Determines if the character is whitespace. + * + * @param ch the character to check + * @return true if the character is whitespace; otherwise false + */ + private boolean isWhiteSpace(int ch) { + switch (ch) { + case ' ': + case '\r': + case '\n': + case '\t': + return true; + default: + return false; + } + } + + /** + * Checks if the given character is alpha-numeric. + * + * @param ch the character to check + * @return true if the character is alpha-numeric; otherwise false. 
+ */ + private boolean isAlphaNumeric(int ch) { + return ('a' <= ch && ch <= 'z') + || ('A' <= ch && ch <= 'Z') + || ('0' <= ch && ch <= '9'); + } +} diff --git a/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/pom/PomParser.java b/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/pom/PomParser.java index 8de5bcb2e..155b0048a 100644 --- a/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/pom/PomParser.java +++ b/dependency-check-core/src/main/java/org/owasp/dependencycheck/xml/pom/PomParser.java @@ -29,6 +29,7 @@ import javax.xml.parsers.SAXParser; import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.input.BOMInputStream; import org.owasp.dependencycheck.utils.XmlUtils; +import org.owasp.dependencycheck.xml.XmlInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -80,7 +81,8 @@ public class PomParser { final SAXParser saxParser = XmlUtils.buildSecureSaxParser(); final XMLReader xmlReader = saxParser.getXMLReader(); xmlReader.setContentHandler(handler); - final BOMInputStream bomStream = new BOMInputStream(inputStream); + + final BOMInputStream bomStream = new BOMInputStream(new XmlInputStream(inputStream)); final ByteOrderMark bom = bomStream.getBOM(); final String defaultEncoding = "UTF-8"; final String charsetName = bom == null ? 
defaultEncoding : bom.getCharsetName(); diff --git a/dependency-check-core/src/test/java/org/owasp/dependencycheck/xml/pom/PomUtilsTest.java b/dependency-check-core/src/test/java/org/owasp/dependencycheck/xml/pom/PomUtilsTest.java index 3b5c285bf..879e1217f 100644 --- a/dependency-check-core/src/test/java/org/owasp/dependencycheck/xml/pom/PomUtilsTest.java +++ b/dependency-check-core/src/test/java/org/owasp/dependencycheck/xml/pom/PomUtilsTest.java @@ -49,7 +49,7 @@ public class PomUtilsTest extends BaseTest { assertEquals(expResult, result.getOrganizationUrl()); file = BaseTest.getResourceAsFile(this, "jmockit-1.26.pom"); - expResult = "Main"; + expResult = "Main ΓΈ modified to test issue #710"; result = PomUtils.readPom(file); assertEquals(expResult, result.getName()); } diff --git a/dependency-check-core/src/test/resources/jmockit-1.26.pom b/dependency-check-core/src/test/resources/jmockit-1.26.pom index 590d69b63..3faac78be 100644 --- a/dependency-check-core/src/test/resources/jmockit-1.26.pom +++ b/dependency-check-core/src/test/resources/jmockit-1.26.pom @@ -7,7 +7,7 @@ org.jmockitjmockit1.26 jar - Main + Main ø modified to test issue #710 JMockit is a Java toolkit for automated developer testing. It contains mocking and faking APIs and a code coverage tool, supporting both JUnit and TestNG. diff --git a/dependency-check-maven/src/it/710-pom-parse-error/postbuild.groovy b/dependency-check-maven/src/it/710-pom-parse-error/postbuild.groovy new file mode 100644 index 000000000..9ec3a0a91 --- /dev/null +++ b/dependency-check-maven/src/it/710-pom-parse-error/postbuild.groovy @@ -0,0 +1,17 @@ +/* + * This file is part of dependency-check-maven. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright (c) 2017 Jeremy Long. All Rights Reserved. + */ diff --git a/src/main/config/checkstyle-suppressions.xml b/src/main/config/checkstyle-suppressions.xml index 56ec1750f..46acbbaf7 100644 --- a/src/main/config/checkstyle-suppressions.xml +++ b/src/main/config/checkstyle-suppressions.xml @@ -12,4 +12,6 @@ + + \ No newline at end of file