rplantiko

Extract base64 inline images from HTML code

May 23rd, 2023 (edited)
173
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. "use strict";
  2.  
  3. /*
  4.  
  5.   Graphics pasted from the clipboard into the TinyMCE rich text editor will be inserted
  6.   into the HTML source code of the edited document as an <img src="data:image/png;base64,..."> tag,
  7.   with the base64 encoded data of the image in its 'src' attribute.
  8.  
  9.   For several reasons, it is better to have the images in separate files instead,
  10.   which are then simply referenced in the HTML source code by a relative URL.
  11.  
  12.   This script
  13.   - finds the base64 inline images
  14.   - extracts the base64 data and saves it as a file with an auto-generated name
  15.   - replaces the src attribute in the HTML code with this filename  
  16.   - saves the adapted HTML file  
  17.  
  18.   We used nodejs with the package 'node-html-parser', which is very easy to use for such tasks
  19.  
  20. */
  21.  
  22. const fs      = require('fs').promises;
  23. const html    = require('node-html-parser');
  24.  
  25. const IMGDIR = 'C:\\\\Temp\\sample\\';         // The directory for the extracted images
  26. const IMGDIR_RELATIVE = 'sample\\';            // Relative name of that directory, as seen from the HTML file location
  27. const HTMLFILE = 'C:\\\\Temp\\sample.html';    // Absolute path of the HTML file containing base64 encoded images
  28.  
  29. run();
  30.  
  31. async function run() {
  32.   try {
  33. // Read the HTML file
  34.     const fileContent = await fs.readFile(HTMLFILE, 'utf8');
  35. // Extract images and replace the src attributes by their relative URL
  36.     const images = extractImages( fileContent );
  37. // Save the images
  38.     for (let img of images) saveToFile(...img);
  39. // Some statistics
  40.     console.log("Total # of images: "+images.length);
  41.     console.log("Total amount of base64 characters: "+images.reduce((acc,[,data])=>acc+data.length,0));
  42.   } catch(err) {
  43.     console.error(err);
  44.   }
  45.  
  46. // Save the image
  47. // Leave the decoding "base64 -> binary image" to the writeFile function
  48.   function saveToFile(fileName,data) {
  49.     fs.writeFile(IMGDIR+fileName, data, {encoding: 'base64'}, function(err) {
  50.       if (err) console.err('Error:',err);
  51.       }
  52.     );     
  53.   }
  54. }
  55.  
  56. // Extracts the base64 encoded images into an array of pairs [fileName,data]
  57. // and returns this array
  58. // Replaces the src attributes of the images by the fileName URL
  59. // Saves the modified HTML file into a file with additional suffix ".new"
  60. function extractImages(htmlContent) {
  61.     const doc = html.parse( htmlContent );
  62.     const imgElements = doc.querySelectorAll("img");
  63.     let images = [], i = 0;
  64.     for (let img of imgElements) {
  65.       try {
  66.         let [fileName,] = pair = extractSingleImage(img,++i);
  67.         images.push(pair);
  68.         img.setAttribute("src",IMGDIR_RELATIVE+fileName);
  69.       }
  70.       catch(err){
  71.         console.log(err);
  72.       }
  73.     });
  74.  
  75.     fs.writeFile(HTMLFILE+".new",doc+"");
  76.     return images;
  77.    
  78. // Returns the pair [fileName,data] extracted from a single <img> element
  79. // Generates a name of the form "imageNNNN.png" (for example) from MIME type "image/png"
  80.     function extractSingleImage(img,i) {
  81.         let a,data,fileName;
  82.         let src = img.getAttribute("src");
  83.         if (src) {
  84.           [,a,data] = src.match(/data:([^;]*);base64,\s*(.*)/);
  85.           fileName = a.replace(/\//,(i+"").padStart(4,"0")+".");
  86.           return [fileName,data];
  87.         }
  88.         else {
  89.             throw "No src attribute for IMG tag:"+(img+"").substring(1,100);
  90.         }
  91.     }
  92. }
  93.  
  94.  
  95.  
  96.  
Add Comment
Please, Sign In to add comment