Files
odc-analyzer/app/com/ysoft/odc/OdcParser.scala
2017-03-20 16:44:45 +01:00

330 lines
14 KiB
Scala

package com.ysoft.odc
import com.github.nscala_time.time.Imports._
import com.ysoft.memory.ObjectPool
import com.ysoft.odc.Confidence.Confidence
import controllers.ReportInfo
import models.{LibraryType, PlainLibraryIdentifier}
import scala.xml._
/**
 * Holds an XML fragment in its textual form.
 *
 * Equality and hashing are based solely on the stored string.
 *
 * @param xmlString the XML text
 */
final case class SerializableXml private (xmlString: String) extends Serializable {

  /** Parses `xmlString` with `SecureXml.loadString`; the text is re-parsed on every call. */
  def xml = SecureXml.loadString(xmlString) // TODO: cache

  /** Two instances are equal exactly when their XML strings are equal. */
  override def equals(obj: scala.Any): Boolean = obj match {
    case that: SerializableXml => that.xmlString == xmlString
    case _ => false
  }

  override def hashCode(): Int = xmlString.hashCode + 42
}
/** Factories that build a [[SerializableXml]] from the `toString` form of XML values. */
object SerializableXml {

  /** Wraps the textual form of a single node. */
  def apply(xml: Node): SerializableXml = new SerializableXml(xml.toString())

  /** Wraps the textual form of a node sequence. */
  def apply(xml: NodeSeq): SerializableXml = new SerializableXml(xml.toString())
}
/**
 * One parsed report.
 *
 * @param scanInfo     the report's `scanInfo` element, kept as XML text
 * @param name         project name read from the report
 * @param reportDate   date read from the report
 * @param dependencies dependencies listed in the report
 */
final case class Analysis(
  scanInfo: SerializableXml,
  name: String,
  reportDate: DateTime,
  dependencies: Seq[Dependency]
)
/**
 * Pair of file fingerprints.
 *
 * @param sha1 SHA-1 hash as text
 * @param md5  MD5 hash as text
 */
final case class Hashes(sha1: String, md5: String) {

  /** Renders both hashes with their field names, e.g. `Hashes(sha1=..., md5=...)`. */
  override def toString: String = "Hashes(sha1=" + sha1 + ", md5=" + md5 + ")"
}
/**
 * Exclusion rule identified by a SHA-1 hash.
 *
 * @param sha1 hash of the excluded file
 */
final case class Exclusion(sha1: String) extends AnyVal {

  /** True when the dependency's SHA-1 equals this exclusion's hash. */
  def matches(dependency: Dependency): Boolean = sha1 == dependency.sha1

  /** True when the group's SHA-1 equals this exclusion's hash. */
  def matches(group: GroupedDependency): Boolean = sha1 == group.sha1
}
/**
 * A single piece of evidence attached to a dependency.
 *
 * All fields are kept as the raw text read from the report.
 */
final case class Evidence(
  source: String,
  name: String,
  value: String,
  confidence: String,
  evidenceType: String
)
/**
 * One dependency entry of a report, with its fingerprints, evidence, identifiers
 * and vulnerabilities (both active and suppressed).
 */
final case class Dependency(
  fileName: String,
  filePath: String,
  md5: String,
  sha1: String,
  description: String,
  evidenceCollected: Set[Evidence],
  identifiers: Seq[Identifier],
  suppressedIdentifiers: Seq[Identifier],
  license: String,
  vulnerabilities: Seq[Vulnerability],
  suppressedVulnerabilities: Seq[Vulnerability],
  relatedDependencies: SerializableXml
) {

  /** Both fingerprints of this dependency bundled together. */
  def hashes = Hashes(sha1, md5)

  /** Library identifiers derived from those (non-suppressed) identifiers that can be converted. */
  def plainLibraryIdentifiers: Set[PlainLibraryIdentifier] = {
    val converted = for {
      identifier <- identifiers
      library <- identifier.toLibraryIdentifierOption
    } yield library
    converted.toSet
  }

  /*
  The equals method seems to be a CPU hog here. It is not clear whether something reasonable can be done about it.
  Comparing by this.hashes would be possible, but then dependencies that differ in evidence would be considered
  the same whenever their JAR hashes match, which would break some sanity checks.
  */
}
/**
 * A group of dependencies having the same fingerprints.
 *
 * @param dependencies each dependency of the group mapped to the set of [[ReportInfo]] values stored with it;
 *                     must be non-empty, because `sha1` is read from the first key at construction time
 */
final case class GroupedDependency(dependencies: Map[Dependency, Set[ReportInfo]]) {

  /**
   * Descriptions (sorted), each split into paragraphs on blank lines (empty paragraphs dropped)
   * and each paragraph split into lines.
   */
  def parsedDescriptions: Seq[Seq[Seq[String]]] =
    descriptions.toSeq.sorted.map { description =>
      val paragraphs = description.trim.split("\n\n").filter(_ != "")
      paragraphs.toSeq.map(paragraph => paragraph.split("\n").toSeq)
    }

  def isVulnerable: Boolean = vulnerabilities.nonEmpty

  /** Highest CVSS score among the vulnerabilities; `None` when there is no score at all. */
  def maxCvssScore = {
    // The leading None keeps `max` defined even when there are no vulnerabilities.
    val scores = Seq(None) ++ vulnerabilities.map(_.cvssScore)
    scores.max
  }

  /** The maximum CVSS score multiplied by the number of projects. */
  def ysdssScore = maxCvssScore.map(score => score * projects.size)

  def descriptions = dependencies.keySet.map(dependency => dependency.description)

  def projects = dependencies.values.flatten.toSet

  def fileNames = dependencies.keySet.map(dependency => dependency.fileName)

  def hashes = dependencies.keys.head.hashes // valid since all deps in a group have the same hashes

  val sha1 = hashes.sha1

  def identifiers: Set[Identifier] = dependencies.keySet.flatMap(dependency => dependency.identifiers)

  def evidenceCollected: Set[Evidence] = dependencies.keySet.flatMap(dependency => dependency.evidenceCollected)

  def suppressedIdentifiers: Set[Identifier] = dependencies.keySet.flatMap(dependency => dependency.suppressedIdentifiers)

  def mavenIdentifiers = identifiers.filter(identifier => identifier.identifierType == "maven")

  def cpeIdentifiers = identifiers.filter(identifier => identifier.identifierType == "cpe")

  def vulnerabilities: Set[Vulnerability] = dependencies.keySet.flatMap(dependency => dependency.vulnerabilities)

  def suppressedVulnerabilities: Set[Vulnerability] = dependencies.keySet.flatMap(dependency => dependency.suppressedVulnerabilities)

  def plainLibraryIdentifiers: Set[PlainLibraryIdentifier] = identifiers.flatMap(identifier => identifier.toLibraryIdentifierOption)

  def hasCpe: Boolean = cpeIdentifiers.nonEmpty

  /**
   * Identifiers sorted by name. When no identifier reaches `threshold`, synthetic "file"
   * identifiers (one per file name, highest confidence, empty URL) are put in front before sorting.
   *
   * @param threshold minimal confidence for an identifier to count as reliable
   */
  def identifiersWithFilenames(threshold: Confidence) = {
    val hasReliableIdentifier = identifiers.exists(_.confidence >= threshold)
    val candidates =
      if (hasReliableIdentifier) {
        identifiers.toIndexedSeq
      } else {
        // If we don't know any reliable identifier, add filenames
        val fromFileNames = fileNames.toIndexedSeq.sorted.map { fileName =>
          Identifier(
            identifierType = "file",
            name = fileName,
            confidence = Confidence.Highest,
            url = ""
          )
        }
        fromFileNames ++ identifiers
      }
    candidates.sortBy(_.name)
  }
}
/** Factory for [[GroupedDependency]] from flat (dependency, report) pairs. */
object GroupedDependency {

  // Kept as a single function value to reduce the number of lambda instances.
  private val groupToSet = (_: Seq[(Dependency, ReportInfo)]).map(_._2).toSet

  /**
   * Groups the pairs by dependency and collects, for each dependency, the set of reports it is paired with.
   *
   * The result map is built strictly. `mapValues` on Scala 2.11/2.12 returns a lazy view that re-runs the
   * mapping function on every access and is not serializable, so it is deliberately avoided here.
   *
   * @param deps (dependency, report) pairs
   * @return a group whose map has one entry per distinct dependency
   */
  def apply(deps: Seq[(Dependency, ReportInfo)]): GroupedDependency = {
    val reportsByDependency = deps.groupBy(_._1).map { case (dependency, occurrences) =>
      dependency -> groupToSet(occurrences)
    }
    GroupedDependency(reportsByDependency)
  } // TODO: the groupBy seems to be a CPU hog (it relies on the equals/hashCode of Dependency).
}
/** Confidence levels, named by the upper-case strings used in reports. */
object Confidence extends Enumeration {

  type Confidence = Value

  // Declaration order matters: values are compared by it (Low < Medium < High < Highest).
  val Low: Value = Value("LOW")
  val Medium: Value = Value("MEDIUM")
  val High: Value = Value("HIGH")
  val Highest: Value = Value("HIGHEST")
}
/** A reference attached to a vulnerability; all fields hold raw text from the report. */
final case class Reference(
  source: String,
  url: String,
  name: String
)
/**
 * Software entry listed as affected by a vulnerability.
 *
 * @param allPreviousVersion value of the `allPreviousVersion` flag
 * @param name               text naming the software
 */
final case class VulnerableSoftware(
  allPreviousVersion: Boolean,
  name: String
)
/**
 * CVSS data of a vulnerability; every part is optional.
 *
 * The spelling of `authenticationr` is part of the public field name and is kept as is.
 */
final case class CvssRating(
  score: Option[Double],
  authenticationr: Option[String],
  availabilityImpact: Option[String],
  accessVector: Option[String],
  integrityImpact: Option[String],
  accessComplexity: Option[String],
  confidentialImpact: Option[String]
)
/**
 * A CWE entry.
 *
 * @param name the full text of the entry; `brief` is the part before the first space
 */
final case class CWE private (name: String) /*extends AnyVal*/ { // extends AnyVal prevents pooling

  override def toString: String = name

  /** The part of `name` before the first space (the whole name when it contains no space). */
  def brief: String = name.takeWhile(_ != ' ')

  /** The number following the "CWE-" prefix of `brief`, or `None` when the prefix or a valid integer is missing. */
  def numberOption: Option[Int] = {
    val identifier = brief
    if (!identifier.startsWith("CWE-")) {
      None
    } else {
      try {
        Some(identifier.substring("CWE-".length).toInt)
      } catch {
        case _: NumberFormatException => None
      }
    }
  }
}
/** Factory for [[CWE]] instances; every created instance is passed through a shared object pool. */
object CWE {

  private val cwePool = new ObjectPool()

  /** Creates a CWE from its full text and returns the result of passing it through the pool. */
  def forIdentifierWithDescription(name: String) = {
    val fresh = new CWE(name)
    cwePool(fresh)
  }
}
/**
 * A vulnerability entry of a report.
 *
 * @param name               vulnerability name
 * @param cweOption          the CWE, if the report gives one
 * @param cvss               CVSS data
 * @param description        description text
 * @param vulnerableSoftware affected software entries
 * @param references         attached references
 */
final case class Vulnerability(
  name: String,
  cweOption: Option[CWE],
  cvss: CvssRating,
  description: String,
  vulnerableSoftware: Seq[VulnerableSoftware],
  references: Seq[Reference]
) {

  /** Shortcut for the score part of `cvss`. */
  def cvssScore = cvss.score

  /** The CVSS score multiplied by the number of distinct projects of the given dependency groups. */
  def ysvssScore(affectedDeps: Set[GroupedDependency]) = cvssScore.map { score =>
    // flatMap on a Set already yields a Set, so the projects are distinct.
    val affectedProjects = affectedDeps.flatMap(_.projects)
    score * affectedProjects.size
  }
}
/**
 * An identifier assigned to a dependency.
 *
 * @param name           identifier text; for type "maven" it is expected to be colon-separated,
 *                       starting with group id and artifact id
 * @param confidence     confidence of the identification
 * @param url            URL attached to the identifier
 * @param identifierType type label; "maven" and "cpe" get special handling here
 */
final case class Identifier(name: String, confidence: Confidence.Confidence, url: String, identifierType: String) {

  /**
   * Converts a maven identifier to a library identifier of the form `groupId:artifactId`.
   *
   * @return `None` for non-maven identifiers and for maven identifiers whose name does not contain
   *         at least a group id and an artifact id (previously such names caused a `MatchError`)
   */
  def toLibraryIdentifierOption: Option[PlainLibraryIdentifier] = {
    if (identifierType == "maven") {
      name.split(':') match {
        case Array(groupId, artifactId, _*) =>
          Some(PlainLibraryIdentifier(libraryType = LibraryType.Maven, libraryIdentifier = s"$groupId:$artifactId"))
        case _ =>
          None // malformed maven coordinates: there is nothing sensible to convert
      }
    } else {
      None
    }
  }

  /** The name of a "cpe" identifier; `None` for every other type. */
  def toCpeIdentifierOption: Option[String] = identifierType match {
    case "cpe" => Some(name)
    case _ => None
  }
  //def isClassifiedInSet(set: Set[PlainLibraryIdentifier]): Boolean = toLibraryIdentifierOption.exists(set contains _)
}
object OdcParser {
// One object pool per kind of parsed value; the parse* methods pass each freshly built value through its pool.
// NOTE(review): ObjectPool is defined elsewhere (com.ysoft.memory); presumably it returns a shared instance
// for equal values in order to save memory -- confirm against its implementation.
private val vulnPool = new ObjectPool()
private val evidencePool = new ObjectPool()
private val dependencyPool = new ObjectPool()
private val identifierPool = new ObjectPool()
private val vulnerableSoftwarePool = new ObjectPool()
/**
 * Children of `node` without the text and PCDATA nodes that contain only whitespace.
 *
 * @param node the parent node
 * @return the remaining children in their original order
 */
def filterWhitespace(node: Node) = node.nonEmptyChildren.filterNot {
  case textual @ (_: scala.xml.Text | _: scala.xml.PCData) => textual.text.trim.isEmpty
  case _ => false
}
/**
 * Fails when `node` has a (non-whitespace) child whose label is not listed in `knownElements`.
 *
 * @param node          the node whose children are checked
 * @param knownElements labels that are allowed
 * @throws RuntimeException (via `sys.error`) naming the unknown labels
 */
def checkElements(node: Node, knownElements: Set[String]): Unit = {
  val presentLabels = filterWhitespace(node).map(_.label).toSet
  val unexpected = presentLabels -- knownElements
  if (unexpected.nonEmpty) {
    sys.error(s"Unknown elements for ${node.label}: $unexpected")
  }
}
/**
 * Collects the keys of an attribute chain, in chain order.
 *
 * @param data head of the attribute chain
 * @return the attribute keys; empty for `Null`
 */
private def getAttributes(data: MetaData): List[String] = {
  @scala.annotation.tailrec
  def collect(remaining: MetaData, reversedKeys: List[String]): List[String] = remaining match {
    case Null => reversedKeys.reverse
    case Attribute(key, _, next) => collect(next, key :: reversedKeys)
  }
  collect(data, Nil)
}
/**
 * Fails when `node` carries an attribute whose key is not listed in `knownParams`.
 *
 * @param node        the node whose attributes are checked
 * @param knownParams attribute keys that are allowed
 * @throws RuntimeException (via `sys.error`) naming the unknown attribute keys
 */
def checkParams(node: Node, knownParams: Set[String]): Unit = {
  val presentKeys = getAttributes(node.attributes).toSet
  val unexpected = presentKeys -- knownParams
  if (unexpected.nonEmpty) {
    sys.error(s"Unknown params for ${node.label}: $unexpected")
  }
}
/**
 * Parses one `software` element into a pooled [[VulnerableSoftware]].
 *
 * The element may contain only text and may carry only the `allPreviousVersion` attribute, whose
 * value must be "true" or "false" (any other value fails in the lookup); a missing attribute means false.
 *
 * @param node the `software` element
 */
def parseVulnerableSoftware(node: Node): VulnerableSoftware = {
  checkElements(node, Set("#PCDATA"))
  checkParams(node, Set("allPreviousVersion"))
  if (node.label != "software") {
    sys.error(s"Unexpected element for vulnerableSoftware: ${node.label}")
  }
  val softwareName = node.text
  val coversPreviousVersions = node.attribute("allPreviousVersion").map(_.text) match {
    case Some(flag) => Map("true" -> true, "false" -> false)(flag)
    case None => false
  }
  vulnerableSoftwarePool(VulnerableSoftware(
    allPreviousVersion = coversPreviousVersions,
    name = softwareName
  ))
}
/**
 * Parses one `reference` element.
 *
 * Only the children `source`, `url` and `name` are allowed, and no attributes.
 *
 * @param node the `reference` element
 */
def parseReference(node: Node): Reference = {
  checkElements(node, Set("source", "url", "name"))
  checkParams(node, Set())
  if (node.label != "reference") {
    sys.error(s"Unexpected element for reference: ${node.label}")
  }
  def childText(label: String) = (node \ label).text
  Reference(
    source = childText("source"),
    url = childText("url"),
    name = childText("name")
  )
}
/**
 * Parses a vulnerability element into a pooled [[Vulnerability]].
 *
 * @param node          the element to parse
 * @param expectedLabel the label the element must have ("vulnerability" unless stated otherwise)
 * @throws RuntimeException (via `sys.error`) on an unknown child element or an unexpected label
 */
def parseVulnerability(node: Node, expectedLabel: String = "vulnerability"): Vulnerability = {
  val allowedChildren = Set("name", "severity", "cwe", "cvssScore", "description", "references", "vulnerableSoftware", "cvssAuthenticationr", "cvssAvailabilityImpact", "cvssAccessVector", "cvssIntegrityImpact", "cvssAccessComplexity", "cvssConfidentialImpact")
  checkElements(node, allowedChildren)
  if (node.label != expectedLabel) {
    sys.error(s"Unexpected element for vuln: ${node.label}")
  }

  // Text of an optional element. The matches are intentionally partial: anything other than zero elements,
  // or exactly one attribute-less element with a single text child, ends in a MatchError.
  def optionalText(elements: NodeSeq) = {
    elements match {
      case Seq() => None
      case Seq(single) =>
        single.attributes match {
          case Null =>
            single.child match {
              case Seq(onlyChild) =>
                onlyChild match {
                  case Text(content) => Some(content)
                }
            }
        }
    }
  }

  val vulnerabilityName = (node \ "name").text
  // <severity> is not read: it is useless, as it is computed from cvssScore.
  val cwe: Option[CWE] = (node \ "cwe").headOption.map(cweNode => CWE.forIdentifierWithDescription(cweNode.text))
  val descriptionText = (node \ "description").text
  val rating = CvssRating(
    score = (node \ "cvssScore").headOption.map(scoreNode => scoreNode.text.toDouble),
    authenticationr = optionalText(node \ "cvssAuthenticationr"),
    availabilityImpact = optionalText(node \ "cvssAvailabilityImpact"),
    accessVector = optionalText(node \ "cvssAccessVector"),
    integrityImpact = optionalText(node \ "cvssIntegrityImpact"),
    accessComplexity = optionalText(node \ "cvssAccessComplexity"),
    confidentialImpact = optionalText(node \ "cvssConfidentialImpact")
  )
  val parsedReferences: Seq[Reference] = (node \ "references").flatMap(filterWhitespace).map(parseReference)
  val parsedSoftware: Seq[VulnerableSoftware] = (node \ "vulnerableSoftware").flatMap(filterWhitespace).map(parseVulnerableSoftware)

  vulnPool(Vulnerability(
    name = vulnerabilityName,
    cweOption = cwe,
    cvss = rating,
    description = descriptionText,
    vulnerableSoftware = parsedSoftware,
    references = parsedReferences
  ))
}
// Matches a name that is wrapped in parentheses and captures the text between them.
// Compiled once here instead of on every parseIdentifier call.
private val IdentifierNamePattern = """\((.*)\)""".r

/**
 * Parses an identifier element into a pooled [[Identifier]].
 *
 * The element may contain only `name` and `url` children and only `type` and `confidence` attributes.
 * The name must be wrapped in parentheses; the parentheses are stripped.
 *
 * @param node          the element to parse
 * @param expectedLabel the label the element must have
 * @throws RuntimeException (via `sys.error`) on an unexpected label, child element or attribute
 * @throws MatchError when the name is not wrapped in parentheses
 * @throws NoSuchElementException when the `type` or `confidence` attribute is missing
 *                                or the confidence is not a known level
 */
def parseIdentifier(node: Node, expectedLabel: String): Identifier = {
  if(node.label != expectedLabel){
    sys.error("Unexpected label for identifier: "+node.label)
  }
  checkElements(node, Set("name", "url"))
  checkParams(node, Set("type", "confidence"))
  identifierPool(Identifier(
    name = (node \ "name").text match {
      case IdentifierNamePattern(text) => text
    },
    url = (node \ "url").text,
    identifierType = node.attribute("type").get.text,
    confidence = Confidence.withName(node.attribute("confidence").get.text)
  ))
}
/**
 * Parses one `dependency` element into a pooled [[Dependency]].
 *
 * The container children `vulnerabilities`, `identifiers` and `evidenceCollected` are all optional:
 * a missing container is treated as empty. (Previously a missing `evidenceCollected` failed on `.head`,
 * unlike the other two.)
 *
 * Inside `vulnerabilities`, children labelled "vulnerability" are the active ones and all others are
 * parsed as "suppressedVulnerability"; `identifiers` is split the same way on "identifier".
 *
 * @param node the `dependency` element
 * @throws RuntimeException (via `sys.error`) on unknown child elements, attributes or unexpected labels
 */
def parseDependency(node: Node): Dependency = {
  checkElements(node, Set("fileName", "filePath", "md5", "sha1", "description", "evidenceCollected", "identifiers", "license", "vulnerabilities", "relatedDependencies"))
  checkParams(node, Set())

  // Non-whitespace children of the first child element with the given label; empty when it is absent.
  def childrenOf(label: String): Seq[Node] =
    (node \ label).headOption.map(filterWhitespace).getOrElse(Seq())

  val (vulnerabilities: Seq[Node], suppressedVulnerabilities: Seq[Node]) = childrenOf("vulnerabilities").partition(_.label == "vulnerability")
  val (identifiers, suppressedIdentifiers) = childrenOf("identifiers").partition(_.label == "identifier")
  dependencyPool(Dependency(
    fileName = (node \ "fileName").text,
    filePath = (node \ "filePath").text,
    md5 = (node \ "md5").text,
    sha1 = (node \ "sha1").text,
    description = (node \ "description").text,
    evidenceCollected = childrenOf("evidenceCollected").map(parseEvidence).toSet,
    identifiers = identifiers.map(parseIdentifier(_, "identifier")),
    suppressedIdentifiers = suppressedIdentifiers.map(parseIdentifier(_, "suppressedIdentifier")),
    license = (node \ "license").text,
    vulnerabilities = vulnerabilities.map(parseVulnerability(_)),
    suppressedVulnerabilities = suppressedVulnerabilities.map(parseVulnerability(_, "suppressedVulnerability")),
    relatedDependencies = SerializableXml(node \ "relatedDependencies")
  ))
}
/**
 * Parses one `evidence` element into a pooled [[Evidence]].
 *
 * Allowed children are `source`, `name` and `value`; the attributes `confidence` and `type`
 * are read unconditionally, so a missing one fails with `NoSuchElementException`.
 *
 * @param node the `evidence` element
 */
def parseEvidence(node: Node): Evidence = {
  if (node.label != "evidence") {
    sys.error(s"Unexpected element for evidence: ${node.label}")
  }
  checkElements(node, Set("source", "name", "value"))
  checkParams(node, Set("confidence", "type"))

  def childText(label: String) = (node \ label).text
  def attributeText(key: String) = node.attribute(key).map(_.text).get

  val evidence = Evidence(
    source = childText("source"),
    name = childText("name"),
    value = childText("value"),
    confidence = attributeText("confidence"),
    evidenceType = attributeText("type")
  )
  evidencePool(evidence)
}
/** Parses every given node as a `dependency` element, keeping the order. */
def parseDependencies(nodes: NodeSeq): Seq[Dependency] =
  for (dependencyNode <- nodes) yield parseDependency(dependencyNode)
/**
 * Parses a whole XML report.
 *
 * The bytes are always decoded as UTF-8 before being handed to `SecureXml.loadString`.
 *
 * @param data raw bytes of the report
 * @return the parsed analysis; fails when the report has no `scanInfo` element
 */
def parseXmlReport(data: Array[Byte]) = {
  val reportText = new String(data, java.nio.charset.StandardCharsets.UTF_8)
  val xml = SecureXml.loadString(reportText)
  val projectInfo = xml \ "projectInfo"
  Analysis(
    scanInfo = SerializableXml((xml \ "scanInfo").head),
    name = (projectInfo \ "name").text,
    reportDate = DateTime.parse((projectInfo \ "reportDate").text),
    dependencies = parseDependencies(xml \ "dependencies" \ "dependency").toIndexedSeq
  )
}
}