Advertisement
emijrp

15m urls crawler

Dec 11th, 2011
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 1.00 KB | None | 0 0
  # 15M URL crawler -- one iteration.
  #
  # Files (all in the current directory):
  #   urls.seed               known URLs, one per line (read and extended)
  #   urls.seed.old           snapshot of urls.seed from the previous run
  #   urls.seed.diff          URLs added to urls.seed since that snapshot
  #   urls.seed-YYYYmmddHHMM  timestamped backup of urls.seed
  #   htmls.acum              concatenated bodies fetched during this run
  #
  # Each run fetches only the URLs that are new since the last snapshot,
  # extracts scheme+host URLs from the fetched HTML, keeps those whose host
  # contains one of the movement keywords, appends them to urls.seed, and
  # finally normalises / de-duplicates urls.seed.

  # Host keywords that mark a URL as relevant (extended regex alternation).
  keywords='15m|15dem|15demayo|15deoctubre|15o|acampada|actas|asamblea|calle|civic|comision|concentracion|crisis|democraciareal|dry[^\.]|desahucio|embarga|hipoteca|indigna|juventud|malestar|marcha|movimiento|nolesvotes|plataforma|primavera|referendum|sinfuturo|soltv|spanishrevolution|takethe|tomala|tomalos|wikispaces|yeswecamp'

  # First run: without a snapshot, diff fails, nothing is crawled, and the
  # snapshot taken below would then hide the initial seeds forever.
  [ -e urls.seed.old ] || : > urls.seed.old

  # URLs present in urls.seed but not in the snapshot (diff lines "> url").
  diff urls.seed.old urls.seed | sed -n 's/^> //p' > urls.seed.diff

  # Timestamped backup, then refresh the snapshot for the next run.
  cp urls.seed "urls.seed-$(date +%Y%m%d%H%M)"
  cp urls.seed urls.seed.old

  # Fetch every new URL, one curl per line. -I{} replaces the deprecated
  # -i; the timeouts keep one stalled host from blocking the whole run.
  xargs -I{} curl -sS --connect-timeout 15 --max-time 60 {} < urls.seed.diff > htmls.acum

  # Extract "http(s)://host" followed by one delimiter character.
  #  -a : fetched pages may contain NUL bytes; without it GNU grep prints
  #       "Binary file matches" instead of the matched URLs.
  #  [a-z0-9.-] : inside a bracket expression a backslash is literal, so the
  #       former [a-z0-9\-\.] never matched "-" and hyphenated hosts were lost.
  # Lowercasing happens first so the case-sensitive www./https rewrites
  # apply to every match.
  grep -aiEo "https?://[a-z0-9.-]+[a-z][ /\\\"<:,';?@]" htmls.acum \
    | tr '[:upper:]' '[:lower:]' \
    | sed -e 's/.$//' -e 's/www\.//g' -e 's/https:/http:/g' \
    | grep -iEv 'files\.wordpress\.com' \
    | sort -u \
    | grep -iE "(${keywords})[a-z0-9.-]*\.[a-z]{2,}" >> urls.seed

  # Normalise the whole seed list: lowercase, fold blogspot.com.es into
  # blogspot.com, sort and de-duplicate. Only replace urls.seed if the
  # rewrite succeeded, so a failure cannot truncate the seed list.
  tr '[:upper:]' '[:lower:]' < urls.seed \
    | sed 's/\.blogspot\.com\.es/.blogspot.com/g' \
    | sort -u > urls.seed.1 \
    && mv urls.seed.1 urls.seed

  # Report the size of every urls* file.
  wc -l urls*
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement