Advertisement
thetenfold

get-text-links__JoeSimmons

Jun 14th, 2013
117
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  /*
   * get-text-links
   *
   * Collects URLs that appear as plain text inside <body> (i.e. not already
   * wrapped in a link) and exposes them as:
   *   URLs   - array of matched URL strings
   *   result - the URLs joined by "\n", or "No URL matches" when none were found
   */

  // Tag names whose text must be ignored, stored as a comma-led list (",a,script").
  var blacklist = "," + [

      "a", // don't get text from anchor nodes
      "script" // or script elements

  ].join(",");

  // Matches one http/https URL whose host contains at least one dot.
  var urlRegex = /https?:\/\/([a-z0-9-]+\.)?([a-z0-9-]+\.)+[a-z0-9-]+(\/|\s|$)[^\s]*/i,
      spaceRegex = /\s+/;

  /**
   * Returns the nearest ancestor of `e` that is an element node (nodeType 1).
   *
   * @param {Node} e - node to start from (the node itself is never returned)
   * @returns {Node|null} the closest element ancestor, or null when there is none
   */
  var parent = function (e) {
      do {
          e = e.parentNode;
      } while (e && e.nodeType !== 1);
      return e;
  };

  /**
   * Tells whether text inside an element with this tag name must be skipped.
   *
   * The name is compared as a whole, comma-delimited entry so that a tag whose
   * name is merely a prefix of a blacklisted one (e.g. "s" vs. "script") is
   * not rejected by accident.
   *
   * @param {string} tagName - element tag name, in any letter case
   * @returns {boolean} true when the tag is listed in `blacklist`
   */
  function isBlacklisted(tagName) {
      var needle = "," + String(tagName).toLowerCase() + ",";
      return (blacklist + ",").indexOf(needle) !== -1;
  }

  /**
   * Extracts every URL from a piece of text.
   *
   * The text is split on whitespace and each token is matched separately, so
   * a text node holding several URLs yields one entry per URL.
   *
   * @param {string} text - raw text content
   * @returns {string[]} matched URLs in order of appearance (possibly empty)
   */
  function extractUrls(text) {
      var found = [],
          tokens = String(text).split(spaceRegex),
          hit, k;

      for (k = 0; k < tokens.length; k++) {
          hit = urlRegex.exec(tokens[k]); // non-global regex: exec keeps no state
          if (hit) {
              found.push(hit[0]);
          }
      }
      return found;
  }

  /**
   * Scans the text nodes under <body> of `doc` and gathers the URLs they contain,
   * skipping text whose nearest element ancestor is blacklisted.
   *
   * @param {Document} doc - document to scan; must provide `evaluate` (DOM XPath)
   * @returns {string[]} every URL found, in snapshot order
   */
  function collectTextLinks(doc) {
      var UNORDERED_NODE_SNAPSHOT_TYPE = 6, // numeric value of XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE
          // Pre-filter on "://" rather than "http://": XPath contains() is
          // case-sensitive and the old filter also dropped https-only text.
          // urlRegex still decides what actually counts as a URL.
          nodes = doc.evaluate("//body//text()[contains(., '://')]", doc, null, UNORDERED_NODE_SNAPSHOT_TYPE, null),
          links = [],
          node, owner, n;

      for (n = 0; n < nodes.snapshotLength; n++) {
          node = nodes.snapshotItem(n);
          owner = parent(node);
          if (owner && isBlacklisted(owner.tagName)) {
              continue;
          }
          links.push.apply(links, extractUrls(node.textContent));
      }
      return links;
  }

  // Only touch the DOM when one exists, so the helpers above can also be
  // loaded (e.g. for testing) in an environment without `document`.
  var URLs = typeof document !== "undefined" ? collectTextLinks(document) : [];

  var result = URLs.length > 0 ? URLs.join("\n") : "No URL matches";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement