Advertisement
Guest User

Top habr comments

a guest
Oct 21st, 2012
1,056
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 2.36 KB | None | 0 0
  // HTTP client shared by the whole script; downloaded text is decoded as UTF-8.
  var wc = new WebClient { Encoding = Encoding.UTF8 };
  2.  
  // Maps a URL to the path of its cached copy on disk: every "/", "." and ":"
  // in the URL is turned into "-", ".html" is appended, and the result is
  // placed inside the cache folder.
  Func<string, string> GetCachePath = s =>
  {
      const string cacheFolder = @"c:\temp\HabrCache";

      var safeName = s;
      foreach (var separator in new[] { "/", ".", ":" })
      {
          safeName = safeName.Replace(separator, "-");
      }

      return Path.Combine(cacheFolder, safeName + ".html");
  };
  10.  
  // Returns the HTML found at URL `s`, using an on-disk cache keyed by GetCachePath(s).
  //
  // - Cache hit: the cached file's text is returned and no request is made.
  // - Successful download: the HTML is written to the cache and returned.
  // - Failed download: an empty string is returned (best effort, the script keeps going).
  //   The empty result is cached only when the server answered with a 4xx status,
  //   so transient failures (timeouts, DNS, 5xx) are retried on the next run
  //   instead of being stored as a permanently empty page.
  Func<string, string> Download = s =>
  {
      var fileName = GetCachePath(s);
      if (File.Exists(fileName))
      {
          return File.ReadAllText(fileName);
      }

      var html = "";
      var shouldCache = true;
      try
      {
          html = wc.DownloadString(s);
      }
      catch (WebException ex)
      {
          // A 4xx answer is treated as definitive for this URL (presumably a removed
          // or hidden post), so remembering the empty result avoids asking again.
          var response = ex.Response as HttpWebResponse;
          var statusCode = response == null ? 0 : (int)response.StatusCode;
          shouldCache = statusCode >= 400 && statusCode < 500;
      }

      if (shouldCache)
      {
          // The cache folder may not exist yet; CreateDirectory is a no-op when it does.
          Directory.CreateDirectory(Path.GetDirectoryName(fileName));
          File.WriteAllText(fileName, html);
      }

      return html;
  };
  29.  
  // Converts a comment score string taken from the page markup (for example "–5")
  // into an int. Throws FormatException when the text is not an integer.
  Func<string, int> ParseCommentRating = s =>
  {
      // Typographic minus characters (en dash, U+2212 minus sign) are not accepted
      // by int.Parse, so normalize them to the ASCII hyphen first.
      var normalized = s.Replace("–", "-").Replace("\u2212", "-");

      // Parse with the invariant culture: the score is machine-readable markup, and
      // the current culture's negative sign is not guaranteed to be the ASCII "-".
      return int.Parse(
          normalized,
          System.Globalization.NumberStyles.Integer,
          System.Globalization.CultureInfo.InvariantCulture);
  };
  34.  
  // Script body: takes the first post link on the "new posts" listing as the newest
  // post id, walks backwards over the most recent post ids, scrapes each comment's
  // id, score and text, and dumps the 20 highest- and 20 lowest-scored comments.
  const string postUrlFormat = "http://habrahabr.ru/post/{0}/";

  // How many of the most recent post ids are scanned.
  const int postsToScan = 500;

  // NOTE(review): Download caches this listing page like any other URL, so once it
  // is cached the newest post id never changes between runs — confirm this is intended.
  var lastPostHtml = Download("http://habrahabr.ru/posts/collective/new/");
  var lastPostRegex = new Regex(string.Format(postUrlFormat, "([0-9]+)"));
  var match = lastPostRegex.Match(lastPostHtml);
  if (!match.Success)
  {
      // Without this guard an empty or unexpected page fails later with an opaque
      // FormatException from int.Parse("").
      throw new InvalidOperationException(
          "No post link found on the new-posts listing page; the download may have failed or the page layout changed.");
  }

  var lastPostId = int.Parse(match.Groups[1].Value);
  lastPostId.Dump("Last post id");
  var allComments = new List<dynamic>();

  // Capture groups: 1 = comment element id, 2 = score text, 3 = message body.
  var commentRegex = new Regex(
      "<div class=\"comment_item\" id=\"(.*?)\".*?<span class=\"score\".*?>(.*?)</span>.*?<div class=\"message.*?\">(.*?)</div>",
      RegexOptions.Singleline | RegexOptions.Compiled | RegexOptions.IgnoreCase);

  // Take some recent posts and parse comments
  for (var id = lastPostId; id > lastPostId - postsToScan; id--)
  {
      var postUrl = string.Format(postUrlFormat, id);
      var postHtml = Download(postUrl);
      var comments = commentRegex.Matches(postHtml).OfType<Match>().Select(c =>
          new
          {
              Id = c.Groups[1].Value,
              Score = ParseCommentRating(c.Groups[2].Value),
              Text = c.Groups[3].Value.Trim(),
              // Link to the comment itself: the post URL plus "#<comment element id>".
              Url = Util.RawHtml(string.Format("<a href='{0}'>{0}</a>", postUrl.TrimEnd('/') + "#" + c.Groups[1].Value))
          });
      allComments.AddRange(comments);
  }

  allComments.OrderByDescending(c => c.Score).Take(20).Dump();
  allComments.OrderBy(c => c.Score).Take(20).Dump();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement