Add:HTML sanitizer lib to support html in podcasts and replace strip html lib

2026-06-06 02:32:44 +02:00 · 2022-05-27 19:41:40 -05:00
parent 96232676cb
commit c4bfa266b0
9 changed files with 1051 additions and 252 deletions
@@ -0,0 +1,874 @@
+/*
+  sanitize-html (Apostrophe Technologies)
+  SOURCE: https://github.com/apostrophecms/sanitize-html
+  LICENSE: https://github.com/apostrophecms/sanitize-html/blob/main/LICENSE
+
+  Modified for audiobookshelf
+*/
+
+const htmlparser = require('htmlparser2');
+// const escapeStringRegexp = require('escape-string-regexp');
+// const { isPlainObject } = require('is-plain-object');
+// const deepmerge = require('deepmerge');
+// const parseSrcset = require('parse-srcset');
+// const { parse: postcssParse } = require('postcss');
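+// (The list of tags that can conceivably represent stand-alone media is `mediaTags`, defined further below.)
+
+// ABS UPDATE: the packages commented out above are no longer required; escape-string-regexp and is-plain-object are inlined below.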
+// SOURCE: https://github.com/sindresorhus/escape-string-regexp/blob/main/index.js
+/**
+ * Escapes every character of `string` that is special in a regular expression,
+ * so the result can be embedded in a RegExp source and match literally.
+ *
+ * @param {string} string - Text to escape.
+ * @returns {string} The escaped text.
+ * @throws {TypeError} When `string` is not a string.
+ */
+function escapeStringRegexp(string) {
+  if (typeof string === 'string') {
+    // Plain backslash escapes for the metacharacters that accept one everywhere.
+    const metaEscaped = string.replace(/[|\\{}()[\]^$+*?.]/g, '\\$&');
+    // A hyphen gets the `\xnn` form, because `\-` is rejected by Unicode-mode patterns.
+    return metaEscaped.replace(/-/g, '\\x2d');
+  }
+
+  throw new TypeError('Expected a string');
+}
+
+// SOURCE: https://github.com/jonschlinkert/is-plain-object/blob/master/is-plain-object.js
+// True when the built-in type tag of `o` is exactly "[object Object]".
+function isObject(o) {
+  const typeTag = Object.prototype.toString.call(o);
+  return typeTag === '[object Object]';
+}
+
+/**
+ * Reports whether `o` is a plain object: an object literal, an instance of
+ * `Object`, or an object without a constructor (e.g. `Object.create(null)`).
+ *
+ * @param {*} o - Value to inspect.
+ * @returns {boolean} `true` for plain objects, `false` for everything else.
+ */
+function isPlainObject(o) {
+  if (isObject(o) === false) return false;
+
+  // No constructor at all: treated as plain
+  const ctor = o.constructor;
+  if (ctor === undefined) return true;
+
+  // A modified prototype disqualifies the object
+  const prot = ctor.prototype;
+  if (isObject(prot) === false) return false;
+
+  // The constructor's prototype must own an Object-specific method. Call
+  // through Object.prototype so that a prototype which shadows
+  // `hasOwnProperty` can neither throw nor spoof the answer.
+  if (Object.prototype.hasOwnProperty.call(prot, 'isPrototypeOf') === false) {
+    return false;
+  }
+
+  // Most likely a plain Object
+  return true;
+}
+
+
+// Tags that can conceivably represent stand-alone media. A frame whose tag is
+// in this list is recorded in its parent frame's `mediaChildren`.
+const mediaTags = [
+  'img', 'audio', 'video', 'picture', 'svg',
+  'object', 'map', 'iframe', 'embed'
+];
+// Tags that are inherently vulnerable to being used in XSS attacks.
+// sanitizeHtml logs a warning when one of these appears in `allowedTags`
+// without the `allowVulnerableTags` option.
+const vulnerableTags = ['script', 'style'];
+
+/**
+ * Calls `cb(value, key)` for every own enumerable key of `obj`.
+ * Does nothing when `obj` is falsy.
+ */
+function each(obj, cb) {
+  if (!obj) {
+    return;
+  }
+  for (const key of Object.keys(obj)) {
+    cb(obj[key], key);
+  }
+}
+
+// Own-property test that cannot be fooled by inherited names such as
+// .__proto__ or .hasOwnProperty.
+function has(obj, key) {
+  return Object.prototype.hasOwnProperty.call(obj, key);
+}
+
+// Collects, in iteration order, the values of `a` for which `cb(value)` is truthy.
+// Iteration goes through `each`, so `a` may be an array, an object or falsy.
+function filter(a, cb) {
+  const kept = [];
+  each(a, (item) => {
+    if (!cb(item)) {
+      return;
+    }
+    kept.push(item);
+  });
+  return kept;
+}
+
+// True when `obj` has no own enumerable string-keyed properties.
+// `null` and `undefined` count as empty.
+function isEmptyObject(obj) {
+  if (obj == null) {
+    return true;
+  }
+  return Object.keys(obj).length === 0;
+}
+
+/**
+ * Serializes parsed srcset candidates back into a `srcset` attribute value.
+ *
+ * @param {Array<{url: string, w?: number, h?: number, d?: number}>} parsedSrcset
+ * @returns {string} Candidates joined with ", ".
+ * @throws {Error} When a candidate has no `url`.
+ */
+function stringifySrcset(parsedSrcset) {
+  const describeCandidate = (part) => {
+    if (!part.url) {
+      throw new Error('URL missing');
+    }
+
+    // Each descriptor is emitted only when its value is truthy
+    const width = part.w ? ` ${part.w}w` : '';
+    const height = part.h ? ` ${part.h}h` : '';
+    const density = part.d ? ` ${part.d}x` : '';
+    return part.url + width + height + density;
+  };
+
+  return parsedSrcset.map(describeCandidate).join(', ');
+}
+
+module.exports = sanitizeHtml;
+
+// A valid attribute name.
+// We use a tolerant definition based on the set of strings defined by
+// html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
+// and html.spec.whatwg.org/multipage/parsing.html#attribute-name-state .
+// The characters accepted are ones which can be appended to the attribute
+// name buffer without triggering a parse error:
+//   * unexpected-equals-sign-before-attribute-name
+//   * unexpected-null-character
+//   * unexpected-character-in-attribute-name
+// We exclude the empty string because it's impossible to get to the after
+// attribute name state with an empty attribute name buffer.
+const VALID_HTML_ATTRIBUTE_NAME = /^[^\0\t\n\f\r /<=>]+$/;
+
+// Ignore the _recursing flag; it's there for recursive
+// invocation as a guard against this exploit:
+// https://github.com/fb55/htmlparser2/issues/105
+
+/**
+ * Sanitizes an HTML string by streaming it through htmlparser2 and rebuilding
+ * the output from only the tags and attributes that the options permit.
+ *
+ * With `disallowedTagsMode: 'discard'` a disallowed tag is dropped; its text is
+ * kept unless the tag is listed in `nonTextTags`. In any other mode the
+ * disallowed markup is emitted HTML-escaped instead.
+ *
+ * ABS UPDATE: the `srcset` and `style` attributes are always removed, because
+ * the parse-srcset and postcss based validation was taken out together with
+ * those dependencies.
+ *
+ * @param {string} html - Markup to sanitize; `null`/`undefined` yields ''.
+ * @param {object} [options] - Overrides, shallow-merged over `sanitizeHtml.defaults`
+ *   (`options.parser` is shallow-merged over the parser defaults).
+ * @param {*} [_recursing] - Not read by this function; kept for signature compatibility.
+ * @returns {string} The sanitized markup.
+ */
+function sanitizeHtml(html, options, _recursing) {
+  if (html == null) {
+    return '';
+  }
+
+  let result = '';
+  // Used for hot swapping the result variable with an empty string in order to "capture" the text written to it.
+  let tempResult = '';
+
+  // One Frame is pushed per open element; it accumulates the element's text
+  // and the media tags found among its children.
+  function Frame(tag, attribs) {
+    const that = this;
+    this.tag = tag;
+    this.attribs = attribs || {};
+    this.tagPosition = result.length;
+    this.text = ''; // Node inner text
+    this.mediaChildren = [];
+
+    this.updateParentNodeText = function () {
+      if (stack.length) {
+        const parentFrame = stack[stack.length - 1];
+        parentFrame.text += that.text;
+      }
+    };
+
+    this.updateParentNodeMediaChildren = function () {
+      if (stack.length && mediaTags.includes(this.tag)) {
+        const parentFrame = stack[stack.length - 1];
+        parentFrame.mediaChildren.push(this.tag);
+      }
+    };
+  }
+
+  options = Object.assign({}, sanitizeHtml.defaults, options);
+  options.parser = Object.assign({}, htmlParserDefaults, options.parser);
+
+  // vulnerableTags
+  vulnerableTags.forEach(function (tag) {
+    if (
+      options.allowedTags && options.allowedTags.indexOf(tag) > -1 &&
+      !options.allowVulnerableTags
+    ) {
+      console.warn(`\n\n⚠️ Your \`allowedTags\` option includes, \`${tag}\`, which is inherently\nvulnerable to XSS attacks. Please remove it from \`allowedTags\`.\nOr, to disable this warning, add the \`allowVulnerableTags\` option\nand ensure you are accounting for this risk.\n\n`);
+    }
+  });
+
+  // Tags that contain something other than HTML, or where discarding
+  // the text when the tag is disallowed makes sense for other reasons.
+  // If we are not allowing these tags, we should drop their content too.
+  // For other tags you would drop the tag but keep its content.
+  const nonTextTagsArray = options.nonTextTags || [
+    'script',
+    'style',
+    'textarea',
+    'option'
+  ];
+  // Per-tag attribute allowlists: exact entries in the map, `*` globs compiled
+  // into one anchored RegExp per tag.
+  let allowedAttributesMap;
+  let allowedAttributesGlobMap;
+  if (options.allowedAttributes) {
+    allowedAttributesMap = {};
+    allowedAttributesGlobMap = {};
+    each(options.allowedAttributes, function (attributes, tag) {
+      allowedAttributesMap[tag] = [];
+      const globRegex = [];
+      attributes.forEach(function (obj) {
+        if (typeof obj === 'string' && obj.indexOf('*') >= 0) {
+          globRegex.push(escapeStringRegexp(obj).replace(/\\\*/g, '.*'));
+        } else {
+          allowedAttributesMap[tag].push(obj);
+        }
+      });
+      if (globRegex.length) {
+        allowedAttributesGlobMap[tag] = new RegExp('^(' + globRegex.join('|') + ')$');
+      }
+    });
+  }
+  const allowedClassesMap = {};
+  const allowedClassesGlobMap = {};
+  const allowedClassesRegexMap = {};
+  each(options.allowedClasses, function (classes, tag) {
+    // Implicitly allows the class attribute
+    if (allowedAttributesMap) {
+      if (!has(allowedAttributesMap, tag)) {
+        allowedAttributesMap[tag] = [];
+      }
+      allowedAttributesMap[tag].push('class');
+    }
+
+    allowedClassesMap[tag] = [];
+    allowedClassesRegexMap[tag] = [];
+    const globRegex = [];
+    classes.forEach(function (obj) {
+      if (typeof obj === 'string' && obj.indexOf('*') >= 0) {
+        globRegex.push(escapeStringRegexp(obj).replace(/\\\*/g, '.*'));
+      } else if (obj instanceof RegExp) {
+        allowedClassesRegexMap[tag].push(obj);
+      } else {
+        allowedClassesMap[tag].push(obj);
+      }
+    });
+    if (globRegex.length) {
+      allowedClassesGlobMap[tag] = new RegExp('^(' + globRegex.join('|') + ')$');
+    }
+  });
+
+  // Tag transforms: a string is shorthand for a simple rename; '*' applies to every tag.
+  const transformTagsMap = {};
+  let transformTagsAll;
+  each(options.transformTags, function (transform, tag) {
+    let transFun;
+    if (typeof transform === 'function') {
+      transFun = transform;
+    } else if (typeof transform === 'string') {
+      transFun = sanitizeHtml.simpleTransform(transform);
+    }
+    if (tag === '*') {
+      transformTagsAll = transFun;
+    } else {
+      transformTagsMap[tag] = transFun;
+    }
+  });
+
+  let depth;
+  let stack;
+  let skipMap;
+  let transformMap;
+  let skipText;
+  let skipTextDepth;
+  let addedText = false;
+
+  initializeState();
+
+  const parser = new htmlparser.Parser({
+    onopentag: function (name, attribs) {
+      // If `enforceHtmlBoundary` is `true` and this has found the opening
+      // `html` tag, reset the state.
+      if (options.enforceHtmlBoundary && name === 'html') {
+        initializeState();
+      }
+
+      if (skipText) {
+        skipTextDepth++;
+        return;
+      }
+      const frame = new Frame(name, attribs);
+      stack.push(frame);
+
+      let skip = false;
+      const hasText = !!frame.text;
+      let transformedTag;
+      if (has(transformTagsMap, name)) {
+        transformedTag = transformTagsMap[name](name, attribs);
+
+        frame.attribs = attribs = transformedTag.attribs;
+
+        if (transformedTag.text !== undefined) {
+          frame.innerText = transformedTag.text;
+        }
+
+        if (name !== transformedTag.tagName) {
+          frame.name = name = transformedTag.tagName;
+          transformMap[depth] = transformedTag.tagName;
+        }
+      }
+      if (transformTagsAll) {
+        transformedTag = transformTagsAll(name, attribs);
+
+        frame.attribs = attribs = transformedTag.attribs;
+        if (name !== transformedTag.tagName) {
+          frame.name = name = transformedTag.tagName;
+          transformMap[depth] = transformedTag.tagName;
+        }
+      }
+
+      if ((options.allowedTags && options.allowedTags.indexOf(name) === -1) || (options.disallowedTagsMode === 'recursiveEscape' && !isEmptyObject(skipMap)) || (options.nestingLimit != null && depth >= options.nestingLimit)) {
+        skip = true;
+        skipMap[depth] = true;
+        if (options.disallowedTagsMode === 'discard') {
+          if (nonTextTagsArray.indexOf(name) !== -1) {
+            skipText = true;
+            skipTextDepth = 1;
+          }
+        }
+      }
+      depth++;
+      if (skip) {
+        if (options.disallowedTagsMode === 'discard') {
+          // We want the contents but not this tag
+          return;
+        }
+        // Escape modes: capture this tag's markup separately so it can be escaped below
+        tempResult = result;
+        result = '';
+      }
+      result += '<' + name;
+
+      if (name === 'script') {
+        if (options.allowedScriptHostnames || options.allowedScriptDomains) {
+          frame.innerText = '';
+        }
+      }
+
+      if (!allowedAttributesMap || has(allowedAttributesMap, name) || allowedAttributesMap['*']) {
+        each(attribs, function (value, a) {
+          if (!VALID_HTML_ATTRIBUTE_NAME.test(a)) {
+            // This prevents part of an attribute name in the output from being
+            // interpreted as the end of an attribute, or end of a tag.
+            delete frame.attribs[a];
+            return;
+          }
+          // check allowedAttributesMap for the element and attribute and modify the value
+          // as necessary if there are specific values defined.
+          let passedAllowedAttributesMapCheck = false;
+          if (!allowedAttributesMap ||
+            (has(allowedAttributesMap, name) && allowedAttributesMap[name].indexOf(a) !== -1) ||
+            (allowedAttributesMap['*'] && allowedAttributesMap['*'].indexOf(a) !== -1) ||
+            (has(allowedAttributesGlobMap, name) && allowedAttributesGlobMap[name].test(a)) ||
+            (allowedAttributesGlobMap['*'] && allowedAttributesGlobMap['*'].test(a))) {
+            passedAllowedAttributesMapCheck = true;
+          } else if (allowedAttributesMap && allowedAttributesMap[name]) {
+            for (const o of allowedAttributesMap[name]) {
+              if (isPlainObject(o) && o.name && (o.name === a)) {
+                passedAllowedAttributesMapCheck = true;
+                let newValue = '';
+                if (o.multiple === true) {
+                  // verify the values that are allowed
+                  const splitStrArray = value.split(' ');
+                  for (const s of splitStrArray) {
+                    if (o.values.indexOf(s) !== -1) {
+                      if (newValue === '') {
+                        newValue = s;
+                      } else {
+                        newValue += ' ' + s;
+                      }
+                    }
+                  }
+                } else if (o.values.indexOf(value) >= 0) {
+                  // verified an allowed value matches the entire attribute value
+                  newValue = value;
+                }
+                value = newValue;
+              }
+            }
+          }
+          if (passedAllowedAttributesMapCheck) {
+            if (options.allowedSchemesAppliedToAttributes.indexOf(a) !== -1) {
+              if (naughtyHref(name, value)) {
+                delete frame.attribs[a];
+                return;
+              }
+            }
+
+            if (name === 'script' && a === 'src') {
+
+              let allowed = true;
+
+              try {
+                const parsed = new URL(value);
+
+                if (options.allowedScriptHostnames || options.allowedScriptDomains) {
+                  const allowedHostname = (options.allowedScriptHostnames || []).find(function (hostname) {
+                    return hostname === parsed.hostname;
+                  });
+                  const allowedDomain = (options.allowedScriptDomains || []).find(function (domain) {
+                    return parsed.hostname === domain || parsed.hostname.endsWith(`.${domain}`);
+                  });
+                  allowed = allowedHostname || allowedDomain;
+                }
+              } catch (e) {
+                allowed = false;
+              }
+
+              if (!allowed) {
+                delete frame.attribs[a];
+                return;
+              }
+            }
+
+            if (name === 'iframe' && a === 'src') {
+              let allowed = true;
+              try {
+                // Chrome accepts \ as a substitute for / in the // at the
+                // start of a URL, so rewrite accordingly to prevent exploit.
+                // Also drop any whitespace at that point in the URL
+                value = value.replace(/^(\w+:)?\s*[\\/]\s*[\\/]/, '$1//');
+                if (value.startsWith('relative:')) {
+                  // An attempt to exploit our workaround for base URLs being
+                  // mandatory for relative URL validation in the WHATWG
+                  // URL parser, reject it
+                  throw new Error('relative: exploit attempt');
+                }
+                // naughtyHref is in charge of whether protocol relative URLs
+                // are cool. Here we are concerned just with allowed hostnames and
+                // whether to allow relative URLs.
+                //
+                // Build a placeholder "base URL" against which any reasonable
+                // relative URL may be parsed successfully
+                let base = 'relative://relative-site';
+                for (let i = 0; (i < 100); i++) {
+                  base += `/${i}`;
+                }
+                const parsed = new URL(value, base);
+                const isRelativeUrl = parsed && parsed.hostname === 'relative-site' && parsed.protocol === 'relative:';
+                if (isRelativeUrl) {
+                  // default value of allowIframeRelativeUrls is true
+                  // unless allowedIframeHostnames or allowedIframeDomains specified
+                  allowed = has(options, 'allowIframeRelativeUrls')
+                    ? options.allowIframeRelativeUrls
+                    : (!options.allowedIframeHostnames && !options.allowedIframeDomains);
+                } else if (options.allowedIframeHostnames || options.allowedIframeDomains) {
+                  const allowedHostname = (options.allowedIframeHostnames || []).find(function (hostname) {
+                    return hostname === parsed.hostname;
+                  });
+                  const allowedDomain = (options.allowedIframeDomains || []).find(function (domain) {
+                    return parsed.hostname === domain || parsed.hostname.endsWith(`.${domain}`);
+                  });
+                  allowed = allowedHostname || allowedDomain;
+                }
+              } catch (e) {
+                // Unparseable iframe src
+                allowed = false;
+              }
+              if (!allowed) {
+                delete frame.attribs[a];
+                return;
+              }
+            }
+            // ABS UPDATE: srcset and inline styles are not supported (the
+            // parse-srcset and postcss dependencies were dropped). Without
+            // those validators the attribute must be removed outright;
+            // returning here keeps it from being written to the output
+            // unvalidated.
+            if (a === 'srcset' || a === 'style') {
+              delete frame.attribs[a];
+              return;
+            }
+            if (a === 'class') {
+              const allowedSpecificClasses = allowedClassesMap[name];
+              const allowedWildcardClasses = allowedClassesMap['*'];
+              const allowedSpecificClassesGlob = allowedClassesGlobMap[name];
+              const allowedSpecificClassesRegex = allowedClassesRegexMap[name];
+              const allowedWildcardClassesGlob = allowedClassesGlobMap['*'];
+              const allowedClassesGlobs = [
+                allowedSpecificClassesGlob,
+                allowedWildcardClassesGlob
+              ]
+                .concat(allowedSpecificClassesRegex)
+                .filter(function (t) {
+                  return t;
+                });
+              if (allowedSpecificClasses && allowedWildcardClasses) {
+                // ABS UPDATE: deepmerge was removed; for two flat arrays its
+                // default merge is a concatenation, so concat keeps the
+                // filtering that would otherwise be skipped entirely.
+                value = filterClasses(value, allowedSpecificClasses.concat(allowedWildcardClasses), allowedClassesGlobs);
+              } else {
+                value = filterClasses(value, allowedSpecificClasses || allowedWildcardClasses, allowedClassesGlobs);
+              }
+              if (!value.length) {
+                delete frame.attribs[a];
+                return;
+              }
+            }
+            result += ' ' + a;
+            if (value && value.length) {
+              result += '="' + escapeHtml(value, true) + '"';
+            }
+          } else {
+            delete frame.attribs[a];
+          }
+        });
+      }
+      if (options.selfClosing.indexOf(name) !== -1) {
+        result += ' />';
+      } else {
+        result += '>';
+        if (frame.innerText && !hasText && !options.textFilter) {
+          result += escapeHtml(frame.innerText);
+          addedText = true;
+        }
+      }
+      if (skip) {
+        result = tempResult + escapeHtml(result);
+        tempResult = '';
+      }
+    },
+    ontext: function (text) {
+      if (skipText) {
+        return;
+      }
+      const lastFrame = stack[stack.length - 1];
+      let tag;
+
+      if (lastFrame) {
+        tag = lastFrame.tag;
+        // If inner text was set by transform function then let's use it
+        text = lastFrame.innerText !== undefined ? lastFrame.innerText : text;
+      }
+
+      if (options.disallowedTagsMode === 'discard' && ((tag === 'script') || (tag === 'style'))) {
+        // htmlparser2 gives us these as-is. Escaping them ruins the content. Allowing
+        // script tags is, by definition, game over for XSS protection, so if that's
+        // your concern, don't allow them. The same is essentially true for style tags
+        // which have their own collection of XSS vectors.
+        result += text;
+      } else {
+        const escaped = escapeHtml(text, false);
+        if (options.textFilter && !addedText) {
+          result += options.textFilter(escaped, tag);
+        } else if (!addedText) {
+          result += escaped;
+        }
+      }
+      if (stack.length) {
+        const frame = stack[stack.length - 1];
+        frame.text += text;
+      }
+    },
+    onclosetag: function (name) {
+
+      if (skipText) {
+        skipTextDepth--;
+        if (!skipTextDepth) {
+          skipText = false;
+        } else {
+          return;
+        }
+      }
+
+      const frame = stack.pop();
+      if (!frame) {
+        // Do not crash on bad markup
+        return;
+      }
+      skipText = options.enforceHtmlBoundary ? name === 'html' : false;
+      depth--;
+      const skip = skipMap[depth];
+      if (skip) {
+        delete skipMap[depth];
+        if (options.disallowedTagsMode === 'discard') {
+          frame.updateParentNodeText();
+          return;
+        }
+        tempResult = result;
+        result = '';
+      }
+
+      if (transformMap[depth]) {
+        name = transformMap[depth];
+        delete transformMap[depth];
+      }
+
+      if (options.exclusiveFilter && options.exclusiveFilter(frame)) {
+        // Drop everything written since this tag was opened
+        result = result.slice(0, frame.tagPosition);
+        return;
+      }
+
+      frame.updateParentNodeMediaChildren();
+      frame.updateParentNodeText();
+
+      if (options.selfClosing.indexOf(name) !== -1) {
+        // Already output />
+        if (skip) {
+          result = tempResult;
+          tempResult = '';
+        }
+        return;
+      }
+
+      result += '</' + name + '>';
+      if (skip) {
+        result = tempResult + escapeHtml(result);
+        tempResult = '';
+      }
+      addedText = false;
+    }
+  }, options.parser);
+  parser.write(html);
+  parser.end();
+
+  return result;
+
+  // Resets all per-document parsing state (also used when an <html> boundary is enforced).
+  function initializeState() {
+    result = '';
+    depth = 0;
+    stack = [];
+    skipMap = {};
+    transformMap = {};
+    skipText = false;
+    skipTextDepth = 0;
+  }
+
+  // Escapes `s` for output as text, or as a double-quoted attribute value when `quote` is truthy.
+  function escapeHtml(s, quote) {
+    if (typeof (s) !== 'string') {
+      s = s + '';
+    }
+    if (options.parser.decodeEntities) {
+      s = s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+      if (quote) {
+        s = s.replace(/"/g, '&quot;');
+      }
+    }
+    // TODO: this is inadequate because it will pass `&0;`. This approach
+    // will not work, each & must be considered with regard to whether it
+    // is followed by a 100% syntactically valid entity or not, and escaped
+    // if it is not. If this bothers you, don't set parser.decodeEntities
+    // to false. (The default is true.)
+    s = s.replace(/&(?![a-zA-Z0-9#]{1,20};)/g, '&amp;') // Match ampersands not part of existing HTML entity
+      .replace(/</g, '&lt;')
+      .replace(/>/g, '&gt;');
+    if (quote) {
+      s = s.replace(/"/g, '&quot;');
+    }
+    return s;
+  }
+
+  // Returns true when `href` uses a scheme (or protocol-relative form) that is not allowed for tag `name`.
+  function naughtyHref(name, href) {
+    // Browsers ignore character codes of 32 (space) and below in a surprising
+    // number of situations. Start reading here:
+    // https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#Embedded_tab
+    // eslint-disable-next-line no-control-regex
+    href = href.replace(/[\x00-\x20]+/g, '');
+    // Clobber any comments in URLs, which the browser might
+    // interpret inside an XML data island, allowing
+    // a javascript: URL to be snuck through
+    href = href.replace(/<!--.*?-->/g, '');
+    // Case insensitive so we don't get faked out by JAVASCRIPT #1
+    // Allow more characters after the first so we don't get faked
+    // out by certain schemes browsers accept
+    const matches = href.match(/^([a-zA-Z][a-zA-Z0-9.\-+]*):/);
+    if (!matches) {
+      // Protocol-relative URL starting with any combination of '/' and '\'
+      if (href.match(/^[/\\]{2}/)) {
+        return !options.allowProtocolRelative;
+      }
+
+      // No scheme
+      return false;
+    }
+    const scheme = matches[1].toLowerCase();
+
+    if (has(options.allowedSchemesByTag, name)) {
+      return options.allowedSchemesByTag[name].indexOf(scheme) === -1;
+    }
+
+    return !options.allowedSchemes || options.allowedSchemes.indexOf(scheme) === -1;
+  }
+
+  // ABS UPDATE: the CSS helpers (filterCss, stringifyStyleAttributes,
+  // filterDeclarations) were removed together with postcss; `style`
+  // attributes are dropped in onopentag instead of being filtered.
+
+  // Keeps the whitespace-separated class names that are listed in `allowed`
+  // or match one of the `allowedGlobs` patterns.
+  function filterClasses(classes, allowed, allowedGlobs) {
+    if (!allowed) {
+      // The class attribute is allowed without filtering on this tag
+      return classes;
+    }
+    classes = classes.split(/\s+/);
+    return classes.filter(function (clss) {
+      return allowed.indexOf(clss) !== -1 || allowedGlobs.some(function (glob) {
+        return glob.test(clss);
+      });
+    }).join(' ');
+  }
+}
+
+// Defaults are accessible to you so that you can use them as a starting point
+// programmatically if you wish
+
+// Parser options; sanitizeHtml shallow-merges any caller-supplied
+// `options.parser` over these. escapeHtml branches on `decodeEntities`.
+const htmlParserDefaults = {
+  decodeEntities: true
+};
+// Default sanitizer options. Caller options are shallow-merged over these with
+// Object.assign, so passing e.g. `allowedTags` replaces the whole list below.
+sanitizeHtml.defaults = {
+  allowedTags: [
+    // Sections derived from MDN element categories and limited to the more
+    // benign categories.
+    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element
+    // Content sectioning
+    'address', 'article', 'aside', 'footer', 'header',
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
+    'main', 'nav', 'section',
+    // Text content
+    // NOTE: 'main' is repeated here (it is also listed under content
+    // sectioning); harmless, since membership is tested with indexOf.
+    'blockquote', 'dd', 'div', 'dl', 'dt', 'figcaption', 'figure',
+    'hr', 'li', 'main', 'ol', 'p', 'pre', 'ul',
+    // Inline text semantics
+    'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', 'code', 'data', 'dfn',
+    'em', 'i', 'kbd', 'mark', 'q',
+    'rb', 'rp', 'rt', 'rtc', 'ruby',
+    's', 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr',
+    // Table content
+    'caption', 'col', 'colgroup', 'table', 'tbody', 'td', 'tfoot', 'th',
+    'thead', 'tr'
+  ],
+  // 'discard' drops disallowed tags instead of emitting them escaped
+  disallowedTagsMode: 'discard',
+  allowedAttributes: {
+    a: ['href', 'name', 'target'],
+    // We don't currently allow img itself by default, but
+    // these attributes would make sense if we did.
+    img: ['src', 'srcset', 'alt', 'title', 'width', 'height', 'loading']
+  },
+  // Lots of these won't come up by default because we don't allow them
+  selfClosing: ['img', 'br', 'hr', 'area', 'base', 'basefont', 'input', 'link', 'meta'],
+  // URL schemes we permit
+  allowedSchemes: ['http', 'https', 'ftp', 'mailto', 'tel'],
+  // Per-tag scheme lists; when a tag has an entry it is used instead of allowedSchemes
+  allowedSchemesByTag: {},
+  // Attributes whose values are checked against the allowed schemes
+  allowedSchemesAppliedToAttributes: ['href', 'src', 'cite'],
+  allowProtocolRelative: true,
+  enforceHtmlBoundary: false
+};
+
+/**
+ * Builds a tag transform that renames a tag and optionally adjusts its attributes.
+ *
+ * @param {string} newTagName - Tag name the transform reports.
+ * @param {object} [newAttribs] - Attributes to apply; defaults to `{}`.
+ * @param {boolean} [merge] - When omitted or truthy, `newAttribs` are copied onto
+ *   the tag's existing attribute object (which is mutated); when falsy,
+ *   `newAttribs` replaces it.
+ * @returns {function(string, object): {tagName: string, attribs: object}}
+ */
+sanitizeHtml.simpleTransform = function (newTagName, newAttribs, merge) {
+  const shouldMerge = merge === undefined ? true : merge;
+  const extraAttribs = newAttribs || {};
+
+  return function (tagName, attribs) {
+    if (!shouldMerge) {
+      return {
+        tagName: newTagName,
+        attribs: extraAttribs
+      };
+    }
+
+    // for...in on purpose: inherited enumerable keys are copied as well
+    for (const key in extraAttribs) {
+      attribs[key] = extraAttribs[key];
+    }
+    return {
+      tagName: newTagName,
+      attribs: attribs
+    };
+  };
+};
@@ -1,4 +1,3 @@
-const { stripHtml } = require('string-strip-html')
 const { getId } = require('../../utils/index')
 const AudioFile = require('../files/AudioFile')
 const AudioTrack = require('../files/AudioTrack')
@@ -78,8 +77,7 @@ class PodcastEpisode {
      episodeType: this.episodeType,
      title: this.title,
      subtitle: this.subtitle,
-      // description: this.description,
-      description: this.descriptionPlain, // Temporary stripping HTML until proper cleaning is implemented
+      description: this.description,
      enclosure: this.enclosure ? { ...this.enclosure } : null,
      pubDate: this.pubDate,
      audioFile: this.audioFile.toJSON(),
@@ -108,10 +106,6 @@ class PodcastEpisode {
    if (this.episode) return `${this.episode} - ${this.title}`
    return this.title
  }
-  get descriptionPlain() {
-    if (!this.description) return ''
-    return stripHtml(this.description).result
-  }

  setData(data, index = 1) {
    this.id = getId('ep')
@@ -1,5 +1,5 @@
 const axios = require('axios')
-const { stripHtml } = require('string-strip-html')
+const htmlSanitizer = require('../utils/htmlSanitizer')
 const Logger = require('../Logger')

 class Audible {
@@ -17,7 +17,7 @@ class Audible {
            narrator: narrators ? narrators.map(({ name }) => name).join(', ') : null,
            publisher: publisher_name,
            publishedYear: release_date ? release_date.split('-')[0] : null,
-            description: publisher_summary ? stripHtml(publisher_summary).result : null,
+            description: publisher_summary ? htmlSanitizer.stripAllTags(publisher_summary) : null,
            cover: this.getBestImageLink(product_images),
            asin,
            series: primarySeries ? primarySeries.title : null,
@@ -1,6 +1,7 @@
 const axios = require('axios')
 const Logger = require('../Logger')
-const { stripHtml } = require('string-strip-html')
+const htmlSanitizer = require('../utils/htmlSanitizer')
+
 class iTunes {
  constructor() { }

@@ -64,7 +65,7 @@ class iTunes {
      artistId: data.artistId,
      title: data.collectionName,
      author: data.artistName,
-      description: stripHtml(data.description || '').result,
+      description: htmlSanitizer.stripAllTags(data.description || ''),
      publishedYear: data.releaseDate ? data.releaseDate.split('-')[0] : null,
      genres: data.primaryGenreName ? [data.primaryGenreName] : [],
      cover: this.getCoverArtwork(data)
@@ -83,7 +84,8 @@ class iTunes {
      artistId: data.artistId || null,
      title: data.collectionName,
      artistName: data.artistName,
-      description: stripHtml(data.description || '').result,
+      description: htmlSanitizer.sanitize(data.description || ''),
+      descriptionPlain: htmlSanitizer.stripAllTags(data.description || ''),
      releaseDate: data.releaseDate,
      genres: data.genres || [],
      cover: this.getCoverArtwork(data),
@@ -0,0 +1,28 @@
+const sanitizeHtml = require('../libs/sanitizeHtml')
+
+// Sanitizes untrusted HTML (e.g. podcast descriptions), keeping only basic formatting tags.
+// 'li' is allowed alongside 'ol'/'ul' so list items are not discarded from otherwise allowed lists.
+function sanitize(html) {
+  const sanitizerOptions = {
+    allowedTags: ['p', 'ol', 'ul', 'li', 'a', 'strong', 'em'],
+    disallowedTagsMode: 'discard',
+    allowedAttributes: {
+      a: ['href', 'name', 'target']
+    },
+    // Only https is a permitted URL scheme; protocol-relative URLs are disabled
+    allowedSchemes: ['https'],
+    allowProtocolRelative: false
+  }
+  return sanitizeHtml(html, sanitizerOptions)
+}
+module.exports.sanitize = sanitize
+
+// Runs the sanitizer with an empty tag allow-list so that no HTML tags
+// are kept in the returned string.
+function stripAllTags(html) {
+  return sanitizeHtml(html, {
+    allowedTags: [],
+    disallowedTagsMode: 'discard'
+  })
+}
+module.exports.stripAllTags = stripAllTags
@@ -1,5 +1,5 @@
 const { xmlToJSON } = require('./index')
-const { stripHtml } = require("string-strip-html")
+const htmlSanitizer = require('./htmlSanitizer')

 function parseCreators(metadata) {
  if (!metadata['dc:creator']) return null
@@ -57,8 +57,7 @@ function fetchDescription(metadata) {
  // check if description is HTML or plain text. only plain text allowed
  // calibre stores < and > as &lt; and &gt;
  description = description.replace(/&lt;/g, '<').replace(/&gt;/g, '>')
-  if (description.match(/<!DOCTYPE html>|<\/?\s*[a-z-][^>]*\s*>|(\&(?:[\w\d]+|#\d+|#x[a-f\d]+);)/)) return stripHtml(description).result
-  return description
+  return htmlSanitizer.stripAllTags(description)
 }

 function fetchGenres(metadata) {
@@ -1,6 +1,6 @@
 const Logger = require('../Logger')
 const { xmlToJSON } = require('./index')
-const { stripHtml } = require('string-strip-html')
+const htmlSanitizer = require('../utils/htmlSanitizer')

 function extractFirstArrayItem(json, key) {
  if (!json[key] || !json[key].length) return null
@@ -55,8 +55,9 @@ function extractPodcastMetadata(channel) {
  }

  if (channel['description']) {
-    metadata.description = extractFirstArrayItem(channel, 'description')
-    metadata.descriptionPlain = stripHtml(metadata.description || '').result
+    const rawDescription = extractFirstArrayItem(channel, 'description') || ''
+    metadata.description = htmlSanitizer.sanitize(rawDescription)
+    metadata.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription)
  }

  var arrayFields = ['title', 'language', 'itunes:explicit', 'itunes:author', 'pubDate', 'link']
@@ -81,8 +82,9 @@ function extractEpisodeData(item) {
  }

  if (item['description']) {
-    episode.description = extractFirstArrayItem(item, 'description')
-    episode.descriptionPlain = stripHtml(episode.description || '').result
+    const rawDescription = extractFirstArrayItem(item, 'description') || ''
+    episode.description = htmlSanitizer.sanitize(rawDescription)
+    episode.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription)
  }

  var arrayFields = ['title', 'pubDate', 'itunes:episodeType', 'itunes:season', 'itunes:episode', 'itunes:author', 'itunes:duration', 'itunes:explicit', 'itunes:subtitle']