Advertisement
hersonHN

Removing all the JS code from an HTML file using cheerio.js

Sep 7th, 2012
343
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. (function () {
  2.     "use strict";
  3.     // Load cheerio.js and fs so we can test against a sample file
  4.     var fs = require("fs"),
  5.         cheerio = require('cheerio'),
  6.         file = 'tags.html';
  7.  
  8.  
  9.     function clearMarkup(rawHTML) {
  10.         var $ = cheerio.load(rawHTML);
  11.  
  12.         // First, we remove all the <script> tags
  13.         $("script").remove();
  14.         // then, on every tag we remove all the javascript attributes
  15.         // like, onclick, onload, etc...
  16.         $("*").each(function () {
  17.             var attribute, tag, attribs;
  18.             tag = $(this);
  19.             attribs = tag.get(0).attribs;
  20.  
  21.             for (attribute in attribs) {
  22.                 if (attribs.hasOwnProperty(attribute)) {
  23.                     if (attribute.toLowerCase().substr(0, 2) === "on") {
  24.                         tag.removeAttr(attribute);
  25.                     }
  26.                 }
  27.             }
  28.         });
  29.  
  30.         return $.html();
  31.     }
  32.  
  33.     // The test, there is no time for handling fs exceptions!!!
  34.     fs.readFile(file, function (error, data) {
  35.         var cleanHTML = clearMarkup(data);
  36.         console.log(cleanHTML);
  37.     });
  38. }());
  39.  
  40.  
  41. /* content of "tags.html" */
  42. /*
  43.     <!doctype html>
  44.     <html>
  45.     <head>
  46.         <title>test</title>
  47.     </head>
  48.     <body onload="alert('evil xss')">
  49.         <ul>
  50.             <li onmouseover="alert('evil XSS')">a</li>
  51.             <li>b</li>
  52.             <li>c</li>
  53.         </ul>
  54.         <script>alert("xss")</script>
  55.     </body>
  56.     </html>
  57.  
  58. */
  59.  
  60.  
  61. /* console output */
  62. /*
  63.     <!doctype html>
  64.     <html>
  65.     <head>
  66.         <title>test</title>
  67.     </head>
  68.     <body>
  69.         <ul>
  70.             <li>a</li>
  71.             <li>b</li>
  72.             <li>c</li>
  73.         </ul>
  74.  
  75.     </body>
  76.     </html>
  77.  
  78. */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement