odc-analyzer/app/com/ysoft/odc/OdcParser.scala

package com.ysoft.odc

import com.github.nscala_time.time.Imports._
import com.ysoft.memory.ObjectPool
import com.ysoft.odc.Confidence.Confidence
import controllers.ReportInfo
import models.{LibraryType, PlainLibraryIdentifier}

import scala.xml._


/**
 * XML content held in its string form; the class extends `Serializable`.
 *
 * The constructor is marked private. The companion object provides factories that accept
 * a `Node` or a `NodeSeq`.
 *
 * @param xmlString textual form of the wrapped XML
 */
final case class SerializableXml private (xmlString: String) extends Serializable{

  /** Parses `xmlString` through `SecureXml.loadString` on every call (nothing is cached). */
  def xml = SecureXml.loadString(xmlString) // TODO: cache

  /** Equal to another `SerializableXml` exactly when both hold equal `xmlString`s. */
  override def equals(obj: scala.Any): Boolean = obj match {
    case that: SerializableXml => that.xmlString == this.xmlString
    case _ => false
  }

  /** Computed from `xmlString` only, in line with `equals`. */
  override def hashCode(): Int = xmlString.hashCode + 42

}

/** Factories building a `SerializableXml` from the `toString()` form of scala.xml values. */
object SerializableXml{
  /** Wraps a single node. */
  def apply(xml: Node): SerializableXml = new SerializableXml(xml.toString())
  /** Wraps a node sequence. */
  def apply(xml: NodeSeq): SerializableXml = new SerializableXml(xml.toString())
}

/**
 * A parsed report, as assembled by `OdcParser.parseXmlReport`.
 *
 * @param scanInfo     the report's `scanInfo` element
 * @param name         text of `projectInfo/name`
 * @param reportDate   parsed text of `projectInfo/reportDate`
 * @param dependencies the parsed `dependencies/dependency` elements
 */
final case class Analysis(
  scanInfo: SerializableXml,
  name: String,
  reportDate: DateTime,
  dependencies: Seq[Dependency]
)

/**
 * The pair of file hashes carried by a dependency.
 *
 * @param sha1 SHA-1 hash as a string
 * @param md5  MD5 hash as a string
 */
final case class Hashes(sha1: String, md5: String){
  /** Renders both fields with their names, e.g. `Hashes(sha1=..., md5=...)`. */
  override def toString: String = "Hashes(sha1=" + sha1 + ", md5=" + md5 + ")"
}

/**
 * Selects dependencies by their SHA-1 hash.
 *
 * @param sha1 the SHA-1 value that a dependency (or group) must carry to match
 */
final case class Exclusion(sha1: String) extends AnyVal {
  /** True when the dependency's `sha1` equals this exclusion's `sha1`. */
  def matches(dependency: Dependency): Boolean = {
    val candidate = dependency.sha1
    candidate == sha1
  }

  /** True when the group's `sha1` equals this exclusion's `sha1`. */
  def matches(group: GroupedDependency): Boolean = {
    val candidate = group.sha1
    candidate == sha1
  }
}

/**
 * One `evidence` element, as built by `OdcParser.parseEvidence`.
 *
 * @param source       text of the `source` child
 * @param name         text of the `name` child
 * @param value        text of the `value` child
 * @param confidence   raw text of the `confidence` attribute (kept as a string)
 * @param evidenceType raw text of the `type` attribute
 */
final case class Evidence(
  source: String,
  name: String,
  value: String,
  confidence: String,
  evidenceType: String
)

/**
 * One `dependency` element of a report, as built by `OdcParser.parseDependency`.
 *
 * The string fields hold the text of the equally named child elements. Identifiers and
 * vulnerabilities are split into regular and suppressed ones. `relatedDependencies` keeps
 * the `relatedDependencies` XML unparsed.
 */
final case class Dependency(
  fileName: String,
  filePath: String,
  md5: String,
  sha1: String,
  description: String,
  evidenceCollected: Set[Evidence],
  identifiers: Seq[Identifier],
  suppressedIdentifiers: Seq[Identifier],
  license: String,
  vulnerabilities: Seq[Vulnerability],
  suppressedVulnerabilities: Seq[Vulnerability],
  relatedDependencies: SerializableXml
){
  /** Both file hashes of this dependency bundled into one value. */
  def hashes = Hashes(md5 = md5, sha1 = sha1)

  /** Library identifiers derived from those (non-suppressed) identifiers that can be converted. */
  def plainLibraryIdentifiers: Set[PlainLibraryIdentifier] = {
    val converted = for {
      identifier <- identifiers
      library <- identifier.toLibraryIdentifierOption
    } yield library
    converted.toSet
  }


  /*
  The equals method seems to be a CPU hog here. It is not clear whether anything reasonable can be done about it.
  Comparing by this.hashes only would be cheaper, but then dependencies that differ in evidence would be considered
  equal whenever their JAR hashes are equal, which would break some sanity checks.
   */
}

/**
 * A group of dependencies having the same fingerprints
 *
 * `sha1` is evaluated when the instance is created and reads the first key of the map,
 * so creating a group from an empty map fails.
 *
 * @param dependencies the grouped dependencies, each mapped to the set of reports associated with it
 */
final case class GroupedDependency(dependencies: Map[Dependency, Set[ReportInfo]]) {
  /** Sorted descriptions, each trimmed, split on blank lines into paragraphs and then into lines. */
  def parsedDescriptions: Seq[Seq[Seq[String]]] = {
    def paragraphsOf(description: String): Seq[Seq[String]] =
      description.trim.split("\n\n").filterNot(_ == "").toSeq.map(_.split("\n").toSeq)
    descriptions.toSeq.sorted.map(paragraphsOf)
  }
  /** True when at least one (non-suppressed) vulnerability is present in the group. */
  def isVulnerable: Boolean = vulnerabilities.nonEmpty
  /** Highest CVSS score among the vulnerabilities; None when there is no score at all. */
  def maxCvssScore = (Seq(None) ++ vulnerabilities.map(_.cvssScore)).max
  /** `maxCvssScore` multiplied by the number of projects of this group. */
  def ysdssScore = maxCvssScore.map(score => score * projects.size)
  def descriptions = for (dependency <- dependencies.keySet) yield dependency.description
  /** All reports associated with any dependency of the group. */
  def projects = dependencies.values.flatten.toSet
  def fileNames = for (dependency <- dependencies.keySet) yield dependency.fileName
  def hashes = dependencies.keys.head.hashes // valid since all deps in a group have the same hashes
  val sha1 = hashes.sha1
  def identifiers: Set[Identifier] =
    for (dependency <- dependencies.keySet; identifier <- dependency.identifiers) yield identifier
  def evidenceCollected: Set[Evidence] =
    for (dependency <- dependencies.keySet; evidence <- dependency.evidenceCollected) yield evidence
  def suppressedIdentifiers: Set[Identifier] =
    for (dependency <- dependencies.keySet; identifier <- dependency.suppressedIdentifiers) yield identifier
  // Identifiers restricted to one identifierType value.
  private def identifiersOfType(wantedType: String) = identifiers.filter(_.identifierType == wantedType)
  def mavenIdentifiers = identifiersOfType("maven")
  def cpeIdentifiers = identifiersOfType("cpe")
  def vulnerabilities: Set[Vulnerability] =
    for (dependency <- dependencies.keySet; vulnerability <- dependency.vulnerabilities) yield vulnerability
  def suppressedVulnerabilities: Set[Vulnerability] =
    for (dependency <- dependencies.keySet; vulnerability <- dependency.suppressedVulnerabilities) yield vulnerability
  def plainLibraryIdentifiers: Set[PlainLibraryIdentifier] =
    for (identifier <- identifiers; library <- identifier.toLibraryIdentifierOption) yield library
  def hasCpe: Boolean = cpeIdentifiers.nonEmpty
  /**
   * Identifiers of the group sorted by name. When no identifier reaches `threshold`,
   * one "file" identifier per file name is added as well.
   */
  def identifiersWithFilenames(threshold: Confidence) = {
    def fileNameIdentifiers = fileNames.toIndexedSeq.sorted.map(filename => Identifier(
      identifierType = "file",
      name = filename,
      confidence = Confidence.Highest,
      url = ""
    ))
    val hasReliableIdentifier = identifiers.exists(_.confidence >= threshold)
    val identifiersSeq =
      if(hasReliableIdentifier) identifiers
      else fileNameIdentifiers ++ identifiers // If we don't know any reliable identifier, add filenames
    identifiersSeq.toIndexedSeq.sortBy(_.name)
  }
}

object GroupedDependency{
  // Extracts the set of reports from the (dependency, report) pairs of one group.
  private val groupToSet = (_: Seq[(Dependency, ReportInfo)]).map(_._2).toSet // reduces number of lambda instances

  /**
   * Builds a group from (dependency, report) pairs: equal dependencies are merged into one
   * key whose value is the set of reports they were paired with.
   *
   * The map is built strictly. The former `mapValues` produced a lazy view that re-ran
   * `groupToSet` on every lookup or traversal.
   *
   * @param deps pairs of a dependency and the report it belongs to
   * @return the group holding one entry per distinct dependency
   */
  def apply(deps: Seq[(Dependency, ReportInfo)]): GroupedDependency = {
    val reportsByDependency: Map[Dependency, Set[ReportInfo]] =
      deps.groupBy(_._1).map { case (dependency, pairs) => dependency -> groupToSet(pairs) }
    new GroupedDependency(reportsByDependency)
  } // TODO: the groupBy seems to be a CPU hog, because its keys are whole Dependency values (Dependency.equals/hashCode).
}

/** Confidence levels, identified by the upper-case names given below. */
object Confidence extends Enumeration {
  type Confidence = Value

  // Declaration order defines the ordering of the values (lowest first); do not reorder.
  val Low: Value = Value("LOW")
  val Medium: Value = Value("MEDIUM")
  val High: Value = Value("HIGH")
  val Highest: Value = Value("HIGHEST")

}

/**
 * One `reference` element of a vulnerability, as built by `OdcParser.parseReference`.
 *
 * @param source text of the `source` child
 * @param url    text of the `url` child
 * @param name   text of the `name` child
 */
final case class Reference(
  source: String,
  url: String,
  name: String
)

/**
 * One `software` element of a vulnerability, as built by `OdcParser.parseVulnerableSoftware`.
 *
 * @param allPreviousVersion value of the `allPreviousVersion` attribute (false when the attribute is absent)
 * @param name               text content of the element
 */
final case class VulnerableSoftware(
  allPreviousVersion: Boolean,
  name: String
)

/**
 * CVSS data of a vulnerability, as built by `OdcParser.parseVulnerability`. Every field is
 * optional and is taken from the `cvss...` child element of the same name (for example
 * `authenticationr` comes from `cvssAuthenticationr`; the spelling follows that element).
 */
final case class CvssRating(
  score: Option[Double],
  authenticationr: Option[String],
  availabilityImpact: Option[String],
  accessVector: Option[String],
  integrityImpact: Option[String],
  accessComplexity: Option[String],
  confidentialImpact: Option[String]
)

/**
 * A CWE entry held as a single string.
 *
 * @param name the full text; `brief` treats everything before the first space as the identifier
 */
final case class CWE private(name: String) /*extends AnyVal*/{ // extends AnyVal prevents pooling
  override def toString = name

  /** The part of `name` before its first space, or all of `name` when it has no space. */
  def brief = {
    val firstSpace = name.indexOf(' ')
    if (firstSpace < 0) name else name.substring(0, firstSpace)
  }

  /** The number following the "CWE-" prefix of `brief`; None without that prefix or when it is not an Int. */
  def numberOption: Option[Int] = {
    val identifier = brief
    if (!identifier.startsWith("CWE-")) None
    else try {
      Some(identifier.substring(4).toInt)
    } catch {
      case _: NumberFormatException => None
    }
  }
}

object CWE{
  // Every CWE created through this object is passed through this pool.
  private val cwePool = new ObjectPool()

  /**
   * Creates a CWE from its full text and returns what the pool yields for it.
   *
   * @param name the full CWE text
   */
  def forIdentifierWithDescription(name: String) = {
    val candidate = new CWE(name)
    cwePool(candidate)
  }
}

/**
 * One vulnerability entry, as built by `OdcParser.parseVulnerability`.
 *
 * @param name               text of the `name` child
 * @param cweOption          CWE built from the `cwe` child, when present
 * @param cvss               CVSS data taken from the `cvss...` children
 * @param description        text of the `description` child
 * @param vulnerableSoftware parsed children of `vulnerableSoftware`
 * @param references         parsed children of `references`
 */
final case class Vulnerability(
  name: String,
  cweOption: Option[CWE],
  cvss: CvssRating,
  description: String,
  vulnerableSoftware: Seq[VulnerableSoftware],
  references: Seq[Reference]
){
  /** Shortcut for the score stored in `cvss`. */
  def cvssScore = cvss.score

  /** The CVSS score multiplied by the number of distinct projects found across `affectedDeps`. */
  def ysvssScore(affectedDeps: Set[GroupedDependency]) = cvssScore.map { score =>
    val affectedProjects = affectedDeps.flatMap(_.projects).toSet
    score * affectedProjects.size
  }
}

/**
 * An identifier attached to a dependency.
 *
 * @param name           the identifier text
 * @param confidence     confidence level of the identifier
 * @param url            URL given for the identifier
 * @param identifierType kind of identifier; "maven" and "cpe" get special treatment below
 */
final case class Identifier(name: String, confidence: Confidence.Confidence, url: String, identifierType: String) {
  /**
   * For "maven" identifiers, the Maven library identifier made of the first two
   * colon-separated parts of `name` ("groupId:artifactId"); None for any other type.
   * A "maven" name with fewer than two parts fails with a MatchError.
   */
  def toLibraryIdentifierOption: Option[PlainLibraryIdentifier] = identifierType match {
    case "maven" =>
      val groupId::artifactId::_ = name.split(':').toList
      Some(PlainLibraryIdentifier(libraryType = LibraryType.Maven, libraryIdentifier = s"$groupId:$artifactId"))
    case _ =>
      None
  }
  /** The name of a "cpe" identifier; None for any other type. */
  def toCpeIdentifierOption: Option[String] =
    if(identifierType == "cpe") Some(name) else None
  //def isClassifiedInSet(set: Set[PlainLibraryIdentifier]): Boolean = toLibraryIdentifierOption.exists(set contains _)
}

object OdcParser {

  // One pool per parsed value type. parseVulnerability, parseEvidence, parseDependency,
  // parseIdentifier and parseVulnerableSoftware pass the value they built through the matching
  // pool and return the pool's result.
  // NOTE(review): ObjectPool is defined elsewhere; presumably it reuses equal instances — confirm.
  private val vulnPool = new ObjectPool()
  private val evidencePool = new ObjectPool()
  private val dependencyPool = new ObjectPool()
  private val identifierPool = new ObjectPool()
  private val vulnerableSoftwarePool = new ObjectPool()

  /**
   * The children of `node` (as given by `nonEmptyChildren`) without the `Text` and `PCData`
   * nodes whose text is empty after trimming.
   */
  def filterWhitespace(node: Node) = {
    def isBlank(content: String) = content.trim == ""
    node.nonEmptyChildren.filterNot{
      case text: scala.xml.Text if isBlank(text.text) => true
      case cdata: scala.xml.PCData if isBlank(cdata.text) => true
      case _ => false
    }
  }

  /**
   * Fails when `node` has a non-whitespace child whose label is not listed in `knownElements`.
   *
   * @param node          the element whose children are checked
   * @param knownElements the accepted child labels
   * @throws RuntimeException (via `sys.error`) naming the element and the unknown labels
   */
  def checkElements(node: Node, knownElements: Set[String]): Unit = {
    val childLabels: Set[String] = filterWhitespace(node).map(_.label).toSet
    val unknownElements = childLabels -- knownElements
    if(unknownElements.nonEmpty){
      sys.error(s"Unknown elements for ${node.label}: $unknownElements")
    }
  }

  /** Keys of all attributes in the chain starting at `data`, in chain order. */
  private def getAttributes(data: MetaData): List[String] = {
    @scala.annotation.tailrec
    def collectKeys(remaining: MetaData, reversedKeys: List[String]): List[String] = remaining match {
      case Null => reversedKeys.reverse
      case Attribute(key, _, next) => collectKeys(next, key :: reversedKeys)
    }
    collectKeys(data, Nil)
  }

  /**
   * Fails when `node` carries an attribute whose key is not listed in `knownParams`.
   *
   * @param node        the element whose attributes are checked
   * @param knownParams the accepted attribute keys
   * @throws RuntimeException (via `sys.error`) naming the element and the unknown keys
   */
  def checkParams(node: Node, knownParams: Set[String]): Unit = {
    val attributeKeys: Set[String] = getAttributes(node.attributes).toSet
    val unknownParams = attributeKeys -- knownParams
    if(unknownParams.nonEmpty){
      sys.error(s"Unknown params for ${node.label}: $unknownParams")
    }
  }


  /**
   * Parses a `software` element: its text becomes the name, and the optional
   * `allPreviousVersion` attribute ("true" or "false") becomes the flag (false when absent).
   * Children other than text, other attributes, or another label are rejected.
   */
  def parseVulnerableSoftware(node: Node): VulnerableSoftware = {
    checkElements(node, Set("#PCDATA"))
    checkParams(node, Set("allPreviousVersion"))
    if(node.label != "software"){
      sys.error(s"Unexpected element for vulnerableSoftware: ${node.label}")
    }
    val softwareName = node.text
    // Only these two literals are accepted; anything else fails in the map lookup.
    val booleanLiterals = Map("true"->true, "false"->false)
    val allPrevious = node.attribute("allPreviousVersion").map(_.text).map(booleanLiterals).getOrElse(false)
    vulnerableSoftwarePool(VulnerableSoftware(
      name = softwareName,
      allPreviousVersion = allPrevious
    ))
  }

  /**
   * Parses a `reference` element from the text of its `source`, `url` and `name` children.
   * Other children, any attribute, or another label are rejected.
   */
  def parseReference(node: Node): Reference = {
    checkElements(node, Set("source", "url", "name"))
    checkParams(node, Set())
    if(node.label != "reference"){
      sys.error(s"Unexpected element for reference: ${node.label}")
    }
    def childText(label: String): String = (node \ label).text
    Reference(
      source = childText("source"),
      url = childText("url"),
      name = childText("name")
    )
  }

  /**
   * Parses a vulnerability element.
   *
   * Child elements are validated against a fixed list; attributes are not validated here.
   *
   * @param node          the element to parse
   * @param expectedLabel the label `node` must have
   */
  def parseVulnerability(node: Node, expectedLabel: String = "vulnerability"): Vulnerability = {
    checkElements(node, Set("name", "severity", "cwe", "cvssScore", "description", "references", "vulnerableSoftware", "cvssAuthenticationr", "cvssAvailabilityImpact", "cvssAccessVector", "cvssIntegrityImpact", "cvssAccessComplexity", "cvssConfidentialImpact"))
    if(node.label != expectedLabel){
      sys.error(s"Unexpected element for vuln: ${node.label}")
    }
    // Text of an optional element: None when absent; otherwise the element must be single,
    // have no attributes and exactly one Text child (any other shape fails with a MatchError).
    def optionalText(elements: NodeSeq): Option[String] = elements match {
      case Seq() => None
      case Seq(only) =>
        only.attributes match {
          case Null =>
            only.child match {
              case Seq(hopefullyTextChild) =>
                hopefullyTextChild match {
                  case Text(content) => Some(content)
                }
            }
        }
    }
    // The values are computed in the order in which they are handed to Vulnerability below.
    val vulnerabilityName = (node \ "name").text
    //severity = (node \ "severity"), <- severity is useless, as it is computed from cvssScore :D
    val cwe: Option[CWE] = (node \ "cwe").headOption.map(_.text).map(CWE.forIdentifierWithDescription)
    val vulnerabilityDescription = (node \ "description").text
    val rating = CvssRating(
      score = (node \ "cvssScore").headOption.map(_.text.toDouble),
      authenticationr = optionalText(node \ "cvssAuthenticationr"),
      availabilityImpact = optionalText(node \ "cvssAvailabilityImpact"),
      accessVector = optionalText(node \ "cvssAccessVector"),
      integrityImpact = optionalText(node \ "cvssIntegrityImpact"),
      accessComplexity = optionalText(node \ "cvssAccessComplexity"),
      confidentialImpact = optionalText(node \ "cvssConfidentialImpact")
    )
    val parsedReferences: Seq[Reference] = (node \ "references").flatMap(filterWhitespace).map(parseReference(_))
    val parsedSoftware: Seq[VulnerableSoftware] = (node \ "vulnerableSoftware").flatMap(filterWhitespace).map(parseVulnerableSoftware)
    vulnPool(Vulnerability(
      name = vulnerabilityName,
      cweOption = cwe,
      description = vulnerabilityDescription,
      cvss = rating,
      references = parsedReferences,
      vulnerableSoftware = parsedSoftware
    ))
  }

  /**
   * Parses an identifier element.
   *
   * The text of the `name` child must be wrapped in parentheses; the text between them
   * becomes the identifier name. The `type` and `confidence` attributes are required.
   *
   * @param node          the element to parse
   * @param expectedLabel the label `node` must have
   */
  def parseIdentifier(node: Node, expectedLabel: String): Identifier = {
    if(node.label != expectedLabel){
      sys.error("Unexpected label for identifier: "+node.label)
    }
    checkElements(node, Set("name", "url"))
    checkParams(node, Set("type", "confidence"))
    val Parenthesized = """\((.*)\)""".r
    val identifierName = (node \ "name").text match {
      case Parenthesized(inner) => inner
    }
    val identifierUrl = (node \ "url").text
    val typeText = node.attribute("type").get.text
    val confidenceLevel = Confidence.withName(node.attribute("confidence").get.text)
    identifierPool(Identifier(
      name = identifierName,
      url = identifierUrl,
      identifierType = typeText,
      confidence = confidenceLevel
    ))
  }

  /**
   * Parses a `dependency` element.
   *
   * Children of `vulnerabilities` labelled "vulnerability" and children of `identifiers`
   * labelled "identifier" are the regular ones; all their other children are parsed as
   * suppressed. `vulnerabilities` and `identifiers` may be absent, whereas the first
   * `evidenceCollected` element is read unconditionally.
   */
  def parseDependency(node: Node): Dependency = {
    checkElements(node, Set("fileName", "filePath", "md5", "sha1", "description", "evidenceCollected", "identifiers", "license", "vulnerabilities", "relatedDependencies"))
    checkParams(node, Set())
    def textOf(label: String): String = (node \ label).text
    // Non-whitespace children of the first element with the given label; empty when there is none.
    def childrenOf(label: String): Seq[Node] = (node \ label).headOption.map(filterWhitespace).getOrElse(Seq())
    val (vulnerabilityNodes, suppressedVulnerabilityNodes) = childrenOf("vulnerabilities").partition(_.label == "vulnerability")
    val (identifierNodes, suppressedIdentifierNodes) = childrenOf("identifiers").partition(_.label == "identifier")
    dependencyPool(Dependency(
      fileName = textOf("fileName"),
      filePath = textOf("filePath"),
      md5 = textOf("md5"),
      sha1 = textOf("sha1"),
      description = textOf("description"),
      evidenceCollected = filterWhitespace((node \ "evidenceCollected").head).map(parseEvidence).toSet,
      identifiers = identifierNodes.map(parseIdentifier(_, "identifier")),
      suppressedIdentifiers = suppressedIdentifierNodes.map(parseIdentifier(_, "suppressedIdentifier")),
      license = textOf("license"),
      vulnerabilities = vulnerabilityNodes.map(parseVulnerability(_)),
      suppressedVulnerabilities = suppressedVulnerabilityNodes.map(parseVulnerability(_, "suppressedVulnerability")),
      relatedDependencies = SerializableXml(node \ "relatedDependencies")
    ))
  }

  /**
   * Parses an `evidence` element from the text of its `source`, `name` and `value` children
   * and its required `confidence` and `type` attributes.
   */
  def parseEvidence(node: Node): Evidence = {
    if(node.label != "evidence"){
      sys.error(s"Unexpected element for evidence: ${node.label}")
    }
    checkElements(node, Set("source", "name", "value"))
    checkParams(node, Set("confidence", "type"))
    def childText(label: String): String = (node \ label).text
    // Fails (Option.get on None) when the attribute is missing.
    def requiredAttribute(key: String): String = node.attribute(key).map(_.text).get
    evidencePool(Evidence(
      source = childText("source"),
      name = childText("name"),
      value = childText("value"),
      confidence = requiredAttribute("confidence"),
      evidenceType = requiredAttribute("type")
    ))
  }

  /** Parses every node of `nodes` with `parseDependency`, keeping their order. */
  def parseDependencies(nodes: NodeSeq): Seq[Dependency] =
    for (dependencyNode <- nodes) yield parseDependency(dependencyNode)

  /**
   * Parses a whole XML report.
   *
   * The bytes are decoded as UTF-8 and loaded through `SecureXml.loadString`.
   * The first `scanInfo` element is required.
   *
   * @param data the raw report bytes
   */
  def parseXmlReport(data: Array[Byte]) = {
    val report = SecureXml.loadString(new String(data, "utf-8"))
    val projectInfo = report \ "projectInfo"
    Analysis(
      scanInfo = SerializableXml((report \ "scanInfo").head),
      name = (projectInfo \ "name").text,
      reportDate = DateTime.parse((projectInfo \ "reportDate").text),
      dependencies = parseDependencies(report \ "dependencies" \ "dependency").toIndexedSeq
    )
  }

}