Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Collects the HTML page titles of every URL mentioned in `message`.
 *
 * Each `http`, `https` or `ftp` URL found in the message is fetched with
 * `get` using the connection returned by `limitRange("0-4096")`; a fetch that
 * throws is treated as empty content. The text captured between the first
 * `<title...>` tag and the next `<` is passed through `entitiesToUni`, and
 * then every run of two or more whitespace characters, as well as any single
 * newline or tab, is replaced by one space.
 *
 * NOTE(review): `limitRange` and `entitiesToUni` are defined outside this
 * block. Presumably the former restricts the download to the first 4 KiB and
 * the latter decodes HTML entities - confirm against their definitions.
 *
 * Params:
 *     message = text to scan for URLs
 *
 * Returns: one title per URL whose content contained a `<title>` match, in
 *          the order the URLs appear in `message`; an empty array if none
 *          matched or if any `Exception` escaped during processing.
 */
string[] scrapeTitles(M)(in M message)
{
    static re_url = ctRegex!(r"(https?|ftp)://[^\s/$.?#].[^\s]*", "i");
    static re_title = ctRegex!(r"<title.*?>(.*?)<", "si");
    static re_ws = ctRegex!(r"(\s{2,}|\n|\t)");

    try
    {
        // Phase 1: download every linked page and look for its <title>.
        // Materialised eagerly so each URL is fetched exactly once before
        // any title post-processing starts.
        auto titleMatches = matchAll(message, re_url)
            .map!( hit => hit.captures[0] )
            .map!( link => get(link, limitRange("0-4096")).ifThrown([]) )
            .map!( page => matchFirst(cast(char[])page, re_title) )
            .array;

        // Phase 2: turn each successful match into a cleaned-up title.
        string[] titles;
        foreach (found; titleMatches)
        {
            if (found.empty)
                continue; // content had no <title> element

            auto decoded = found[1].idup.entitiesToUni;
            titles ~= decoded.replaceAll(re_ws, " ");
        }
        return titles;
    }
    catch (Exception)
    {
        // Best effort: any failure discards all results.
        return string[].init;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement