# Data set: webdevdata.org-2013-09-01-201332
#
# Scans saved pages (*.html.txt) for tags whose href/src attribute holds an
# http(s) URL that has a '%' or '&' right after "://", before any space,
# quote, '/', '?' or '#' appears.

#######################################
# Print every matching tag found in the *.html.txt files under a directory.
# Arguments: $1 - directory to search (optional, defaults to ./)
# Outputs:   one matched tag per line on stdout; because the greps run in
#            parallel, the order of lines is not deterministic
# Returns:   exit status of the find | xargs pipeline
#######################################
find_encoded_authority_links() {
  local root=${1:-./}

  # xargs flags:
  #   -0 / -print0  NUL-delimited names, safe for any file name.
  #   -r            do not run grep at all when find matched no files.
  #   -n1           one file per grep, so matches carry no file-name prefix.
  #   -P8           up to eight greps at once.
  # grep flags:
  #   -E -i -o      extended regex, case-insensitive, print only the match.
  #   --line-buffered  flush after each match; with several greps sharing
  #                 stdout, block-buffered output is far more likely to get
  #                 lines cut and spliced together.
  find "$root" -name "*.html.txt" -print0 \
    | xargs -0 -r -n1 -P8 grep --line-buffered -Eio "<[^>]+\s(href|src)\s*=\s*[\"']?https?://[^ \"'/?#]*[%&][^>]+>"
}

find_encoded_authority_links "$@"