Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/// <summary>
/// Fetches <paramref name="url"/> and collects the link targets found on the page.
/// </summary>
/// <param name="url">Address of the page to request.</param>
/// <returns>
/// The <c>href</c> of every anchor that has one, each passed through
/// <c>MakeAbsolute</c> together with <c>BaseUrl</c>. The list is empty when the
/// response is not an HTML page or its status is not <c>200 OK</c>.
/// </returns>
/// <remarks>
/// Exceptions raised while requesting or reading the page are not caught here
/// and surface through the returned task.
/// </remarks>
protected async Task<List<string>> DownloadUrls(string url)
{
    var request = HttpWebRequest.CreateHttp(url);

    // One response serves both the header checks and the body read; the using
    // block releases its connection on every exit path.
    using (var response = (HttpWebResponse)await request.GetResponseAsync())
    {
        if (!response.IsHtmlPage() || response.StatusCode != HttpStatusCode.OK)
        {
            return new List<string>();
        }

        // Reading the body and parsing the markup are synchronous calls, so
        // they run on the thread pool. The await keeps the response alive
        // until that work has finished.
        return await Task.Run(() =>
        {
            var content = response.Download();
            var dom = CsQuery.CQ.CreateDocument(content);

            // "a[href]" skips anchors that carry no href attribute: they have
            // no target to resolve.
            return dom["a[href]"]
                .Select(anchor => MakeAbsolute(anchor.GetAttribute("href"), this.BaseUrl))
                .ToList();
        });
    }
}
/// <summary>
/// Runs <see cref="DownloadUrls"/> for every entry of <paramref name="urls"/>,
/// letting at most five of those calls proceed at the same time.
/// </summary>
/// <param name="urls">Addresses to fetch.</param>
/// <returns>An array holding one list of links per requested address.</returns>
protected async Task<List<string>[]> DownloadAsync(List<string> urls)
{
    using (var gate = new SemaphoreSlim(5))
    {
        var downloads = urls.Select(async address =>
        {
            // Wait for a free slot before starting this download.
            await gate.WaitAsync();
            try
            {
                return await DownloadUrls(address);
            }
            finally
            {
                // Hand the slot back whether the download succeeded or threw.
                gate.Release();
            }
        });

        // Awaited inside the using block so the gate outlives every download.
        var linksPerAddress = await Task.WhenAll(downloads);
        return linksPerAddress;
    }
}
/// <summary>
/// Starts a crawl at <paramref name="baseHost"/> without giving the caller a
/// way to wait for it.
/// </summary>
/// <param name="baseHost">Address of the first page to fetch.</param>
/// <remarks>
/// Kept as <c>async void</c> so existing call sites are unaffected; the caller
/// can neither await the crawl nor catch an exception it throws. Code that
/// needs either should await <see cref="RecursiveParseAsync(string)"/>.
/// </remarks>
protected async void RecursiveParse(string baseHost)
{
    await RecursiveParseAsync(baseHost);
}

/// <summary>
/// Follows links starting from <paramref name="urls"/> without giving the
/// caller a way to wait for the crawl.
/// </summary>
/// <param name="urls">Addresses to fetch first.</param>
/// <param name="depth">Number of link levels to fetch; nothing happens when it is zero or negative.</param>
/// <remarks>
/// Kept as <c>async void</c> so existing call sites are unaffected. Code that
/// needs to await the crawl should call
/// <see cref="RecursiveParseAsync(List{string}, int)"/>.
/// </remarks>
protected async void RecursiveParse(List<string> urls, int depth)
{
    await RecursiveParseAsync(urls, depth);
}

/// <summary>
/// Fetches <paramref name="baseHost"/> and then follows the links found on it.
/// </summary>
/// <param name="baseHost">Address of the first page to fetch.</param>
/// <returns>A task that completes once every level of the crawl has finished.</returns>
protected async Task RecursiveParseAsync(string baseHost)
{
    // Number of link levels followed below the start page.
    const int followDepth = 3;

    var pages = await DownloadAsync(new List<string> { baseHost });
    foreach (var links in pages)
    {
        await RecursiveParseAsync(links, followDepth);
    }
}

/// <summary>
/// Fetches <paramref name="urls"/>, then the links found on those pages, and
/// so on for <paramref name="depth"/> levels.
/// </summary>
/// <param name="urls">Addresses to fetch at the first level.</param>
/// <param name="depth">Number of levels to fetch; nothing is fetched when it is zero or negative.</param>
/// <returns>A task that completes once the last level has been fetched.</returns>
/// <remarks>
/// An address is requested at most once per call, even when several pages
/// link to it. The links collected on the final level are not stored or
/// returned.
/// </remarks>
protected async Task RecursiveParseAsync(List<string> urls, int depth)
{
    var visited = new HashSet<string>();
    IEnumerable<string> frontier = urls;

    for (var remaining = depth; remaining > 0; remaining--)
    {
        // HashSet.Add returns false for an address it already holds, so this
        // keeps only the addresses that have not been requested yet.
        var pending = frontier.Where(visited.Add).ToList();
        if (pending.Count == 0)
        {
            return;
        }

        var pages = await DownloadAsync(pending);
        frontier = pages.SelectMany(links => links);
    }
}
Advertisement
Add Comment
Please, Sign In to add comment