Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* Do not forget to add a reference to System.Net.Http */
- using System;
- using System.Collections.Concurrent;
- using System.Collections.Generic;
- using System.Linq;
- using System.Net;
- using System.Net.Http;
- using System.Text.RegularExpressions;
- using System.Threading;
- using System.Threading.Tasks;
- namespace PasteScraper.Pastebin
- {
/// <summary>
/// Utility class for scraping the most recently posted pastes
/// </summary>
public class Scraper
{
    /// <summary>
    /// Archive URL to get the most recently posted pastes from
    /// </summary>
    private const string ArchiveUrl = "http://pastebin.com/archive";

    /// <summary>
    /// Length a matched path must have to be treated as a paste URL
    /// (a forward slash followed by the paste's ID)
    /// </summary>
    private const int RelativeUrlLength = 9;

    /// <summary>
    /// Seconds to wait for the downloads when a non-positive timeout is passed
    /// </summary>
    private const int DefaultTimeoutSeconds = 10;

    /// <summary>
    /// Matches path-like strings in the archive page; group 2 holds the path.
    /// NOTE(review): the pattern only matches after two consecutive
    /// double-quote characters - confirm this still fits the archive markup.
    /// </summary>
    private static readonly Regex UrlRegex = new Regex(".*?(\"\").*?((?:\\/[\\w\\.\\-]+)+)");

    /// <summary>
    /// Single client reused for every archive request instead of creating
    /// (and disposing) a new one per call
    /// </summary>
    private static readonly HttpClient Client;

    /// <summary>
    /// Sets the default web proxy for HTTP requests to null, so
    /// the system will not try to resolve it (which causes a delay),
    /// and then creates the shared HTTP client
    /// </summary>
    static Scraper()
    {
        WebRequest.DefaultWebProxy = null;
        //Created here rather than in a field initializer so the proxy is cleared first
        Client = new HttpClient();
    }

    /// <summary>
    /// Scrapes the most recently posted pastes. Method returns when all
    /// pastes have been downloaded, or the timeout elapsed, whichever
    /// happens first. Pastes that failed to download, or that were still
    /// downloading when the timeout elapsed, are left out of the result.
    /// </summary>
    /// <param name="timeout">Timeout for downloading pastes in seconds;
    /// a value of zero or less selects the default of 10 seconds</param>
    /// <returns>Most recently posted pastes that finished downloading in time</returns>
    public static async Task<IEnumerable<Paste>> Scrape(int timeout)
    {
        var pageSource = await Client
            .GetStringAsync(new Uri(ArchiveUrl, UriKind.Absolute))
            .ConfigureAwait(false);

        var urls = UrlRegex.Matches(pageSource)
            .Cast<Match>()
            .Select(match => match.Groups[2].Value)
            .Where(url => url.Length == RelativeUrlLength)
            .ToList();

        //Start every download up front; ToList() forces the lazy query to run now
        var downloads = urls.Select(url => DownloadOrNullAsync(url)).ToList();

        var seconds = timeout > 0 ? timeout : DefaultTimeoutSeconds;
        using (var cancelDeadline = new CancellationTokenSource()) {
            var deadline = Task.Delay(TimeSpan.FromSeconds(seconds), cancelDeadline.Token);
            await Task.WhenAny(Task.WhenAll(downloads), deadline).ConfigureAwait(false);
            //Release the timer early when the downloads won the race
            cancelDeadline.Cancel();
        }

        //Snapshot of what finished in time; the result never changes after returning
        return downloads
            .Where(download => download.Status == TaskStatus.RanToCompletion && download.Result != null)
            .Select(download => download.Result)
            .ToList();
    }

    /// <summary>
    /// Downloads a single paste, turning an expected network failure into
    /// a null result so one bad paste cannot take down the whole scrape
    /// </summary>
    /// <param name="url">A paste's relative URL ("/id")</param>
    /// <returns>The downloaded paste, or null if the request failed or timed out</returns>
    private static async Task<Paste> DownloadOrNullAsync(string url)
    {
        try {
            return await Paste.ParseFromUrlAsync(url).ConfigureAwait(false);
        }
        catch (HttpRequestException) {
            //Best effort: skip pastes the server refused or that could not be reached
            return null;
        }
        catch (TaskCanceledException) {
            //HttpClient reports its own request timeout as a cancellation
            return null;
        }
    }
}
/// <summary>
/// Contains information regarding a parsed paste
/// </summary>
public class Paste
{
    /// <summary>
    /// Required length of a relative paste URL (a forward slash plus the ID)
    /// </summary>
    private const int RelativeUrlLength = 9;

    /// <summary>
    /// URL the paste's ID is appended to in order to download its raw content
    /// </summary>
    private const string RawBaseUrl = "http://pastebin.com/raw.php?i=";

    /// <summary>
    /// Single client reused for every download instead of creating
    /// (and disposing) a new one per paste
    /// </summary>
    private static readonly HttpClient Client = new HttpClient();

    /// <summary>
    /// The paste's content
    /// </summary>
    public string Content { get; private set; }

    /// <summary>
    /// The paste's ID
    /// </summary>
    public string Id { get; private set; }

    /// <summary>
    /// Creates a new Paste object and initializes the content and ID
    /// </summary>
    /// <param name="content">The paste's content</param>
    /// <param name="id">The paste's ID</param>
    public Paste(string content, string id)
    {
        Content = content;
        Id = id;
    }

    /// <summary>
    /// Downloads a paste and wraps it in a new <see cref="Paste"/> object.
    /// The argument is validated eagerly, so an invalid URL throws from
    /// this call itself rather than from the returned task.
    /// </summary>
    /// <param name="relativeUrl">A paste's relative URL ("/id")</param>
    /// <returns>Task producing the downloaded <see cref="Paste"/></returns>
    /// <exception cref="ArgumentNullException">The relative URL is null</exception>
    /// <exception cref="ArgumentException">The relative URL does not start
    /// with a forward slash or does not have the expected length</exception>
    public static Task<Paste> ParseFromUrlAsync(string relativeUrl)
    {
        if (relativeUrl == null) {
            throw new ArgumentNullException("relativeUrl");
        }
        //Ordinal comparison: the slash is a URL delimiter, not culture-dependent text
        if (!relativeUrl.StartsWith("/", StringComparison.Ordinal) || relativeUrl.Length != RelativeUrlLength) {
            throw new ArgumentException("The relative URL passed is invalid.", "relativeUrl");
        }
        //Skip the forward slash to get the ID
        return ParseFromUrlAsyncImpl(relativeUrl.Substring(1));
    }

    /// <summary>
    /// Downloads the raw content of the paste with the given ID
    /// </summary>
    /// <param name="id">A paste's ID</param>
    /// <returns>New <see cref="Paste"/> holding the downloaded content and the ID</returns>
    private static async Task<Paste> ParseFromUrlAsyncImpl(string id)
    {
        //Escape the ID so characters such as '&' or '#' cannot alter the query string
        var pageSource = await Client
            .GetStringAsync(RawBaseUrl + Uri.EscapeDataString(id))
            .ConfigureAwait(false);
        return new Paste(pageSource, id);
    }

    /// <summary>
    /// Formats the paste as its ID, a colon and a line break, followed by the content
    /// </summary>
    /// <returns>Formatted string containing the paste's ID and content</returns>
    public override string ToString()
    {
        return string.Format("{0}:\n{1}", Id, Content);
    }
}
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement