Want more features on Pastebin? Sign Up, it's FREE!
Guest

Prefix tree find-and-replace.

By: a guest on Oct 11th, 2012  |  syntax: JavaScript  |  size: 4.47 KB  |  views: 27  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. function buildTrie(hash) {
  2.     "use strict";
  3.     // A very simple function to build a Trie
  4.     // we could compress this later, but simplicity
  5.     // is better for this example. If we don't
  6.     // perform well, we'll try to optimize this a bit
  7.     // there is a room for optimization here.
  8.     var p, result = {}, leaf, i;
  9.     for (p in hash) {
  10.         if (hash.hasOwnProperty(p)) {
  11.             leaf = result;
  12.             i = 0;
  13.             do {
  14.                 if (p[i] in leaf) {
  15.                     leaf = leaf[p[i]];
  16.                 } else {
  17.                     leaf = leaf[p[i]] = {};
  18.                 }
  19.                 i += 1;
  20.             } while (i < p.length);
  21.             // since, obviously, no character
  22.             // equals to empty character, we'll
  23.             // use it to store the reference to the
  24.             // original value
  25.             leaf[""] = hash[p];
  26.         }
  27.     }
  28.     return result;
  29. }
  30.  
  31. function prefixReplaceHtml(html, trie) {
  32.     "use strict";
  33.     var i, len = html.length, result = [], lastMatch = 0,
  34.         current, leaf, match, matched, replacement;
  35.     for (i = 0; i < len; i += 1) {
  36.         current = html[i];
  37.         if (current === "<") {
  38.             // don't check for out of bounds access
  39.             // assume we never face a situation, when
  40.             // "<" is the last character in an HTML
  41.             if (match) {
  42.                 result.push(
  43.                     html.substring(lastMatch, i - matched.length),
  44.                     "<a href=\"", match, "\">", replacement, "</a>");
  45.                 lastMatch = i - matched.length + replacement.length;
  46.                 i = lastMatch - 1;
  47.             } else {
  48.                 if (matched) {
  49.                     // go back to the second character of the
  50.                     // matched string and try again
  51.                     i = i - matched.length;
  52.                 }
  53.             }
  54.             matched = match = replacement = leaf = "";
  55.             if (html[i + 1] === "a") {
  56.                 // we want to skip replacing inside
  57.                 // anchor tags. We also assume they
  58.                 // are never nested, as valid HTML is
  59.                 // against that idea
  60.                 if (html[i + 2] in
  61.                     { " " : 1, "\t" : 1, "\r" : 1, "\n" : 1 }) {
  62.                     // this is certainly an anchor
  63.                     i = html.indexOf("</a", i + 3) + 3;
  64.                     continue;
  65.                 }
  66.             }
  67.             // if we got here, it's a regular tag, just look
  68.             // for terminating ">"
  69.             i = html.indexOf(">", i + 1);
  70.             continue;
  71.         }
  72.         // if we got here, we need to start checking
  73.         // for the match in the trie
  74.         if (!leaf) {
  75.             leaf = trie;
  76.         }
  77.         leaf = leaf[current];
  78.         // we prefer longest possible match, just like POSIX
  79.         // regular expressions do
  80.         if (leaf && ("" in leaf)) {
  81.             match = leaf[""];
  82.             replacement = html.substring(
  83.                 i - (matched ? matched.length : 0), i + 1);
  84.         }
  85.         if (!leaf) {
  86.             // newby-style inline (all hand work!) pay extra
  87.             // attention, this code is duplicated few lines above
  88.             if (match) {
  89.                 result.push(
  90.                     html.substring(lastMatch, i - matched.length),
  91.                     "<a href=\"", match, "\">", replacement, "</a>");
  92.                 lastMatch = i - matched.length + replacement.length;
  93.                 i = lastMatch - 1;
  94.             } else {
  95.                 if (matched) {
  96.                     // go back to the second character of the
  97.                     // matched string and try again
  98.                     i = i - matched.length;
  99.                 }
  100.             }
  101.             matched = match = replacement = "";
  102.         } else if (matched) {
  103.             // perhaps a bit premature, but we'll try to avoid
  104.             // string concatenation, when we can.
  105.             matched = html.substring(i - matched.length, i + 1);
  106.         } else {
  107.             matched = current;
  108.         }
  109.     }
  110.     return result.join("");
  111. }
  112.            
  113. function testPrefixReplace() {
  114.     "use strict";
  115.     var trie = buildTrie(
  116.         { "x" : "www.xxx.com", "yyy" : "www.y.com",
  117.           "xy" : "www.xy.com", "yy" : "www.why.com" });
  118.     // return trie["y"]["y"]["y"];
  119.     return prefixReplaceHtml(
  120.         "<html><head>x</head><body><a >yyy</a><p>" +
  121.             "xyyy yy x xy</p><abrval><yy>xxy</yy>", trie);
  122. }
clone this paste RAW Paste Data