Advertisement
soandos

SU post analysis code

Sep 4th, 2011
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 13.95 KB | None | 0 0
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
  7.  
  8. namespace superuser
  9. {
  10.     static class SUAnalysis
  11.     {
        // Per-post history state keyed by post id. FromPostHistory passes this to AddSUHistory
        // (which inserts/updates entries), removes entries for some history types, and finally
        // writes the entries flagged as closed to the CSV output.
        static Dictionary<int, SUHistoryContainer> _postHistoryData = new Dictionary<int, SUHistoryContainer>();
        // Title, body, score and creation month of each row loaded by FromPostHistory from the
        // FromPosts output file, keyed by post id. AddSUHistory reads it when creating new entries.
        static Dictionary<int, SUPostData> _postData = new Dictionary<int, SUPostData>();
  14.  
  15.         /// <summary>
  16.         /// Gets the data out of the posts.xml file. Prerequisite for FromPostHistory to be run.
  17.         /// </summary>
  18.         /// <param name="path">Path to posts.xml</param>
  19.         /// <param name="outputPath">Path to output file</param>
  20.         public static void FromPosts(string path, string outputPath)
  21.         {
  22.             using (StreamReader reader = new StreamReader(path))
  23.             {
  24.                 List<string> thedata = new List<string>();
  25.                 while (reader.Peek() >= 0)
  26.                 {
  27.                     string a = reader.ReadLine();
  28.                     if (!string.IsNullOrEmpty(a))
  29.                     {
  30.                         if (a.Contains("ClosedDate") && !a.Contains("PostTypeId=\"2\""))
  31.                         {
  32.                             thedata.Add(a);
  33.                         }
  34.                     }
  35.                 }
  36.                 Regex score = new Regex("(?<=Score=\")-?\\d+");
  37.                 thedata = (from line in thedata
  38.                            let scoreAsString = score.Match(line).Value
  39.                            orderby int.Parse(scoreAsString)
  40.                            select line).ToList();
  41.                 using (StreamWriter writer = new StreamWriter(outputPath))
  42.                 {
  43.                     foreach (string s in thedata)
  44.                     {
  45.                         writer.WriteLine(s);
  46.                     }
  47.                 }
  48.             }
  49.         }
  50.         /// <summary>
  51.         /// Gets the data from the posthistory.xml file, and then outputs its results to another file. Requires FromPosts to have been run beforehand.
  52.         /// </summary>
  53.         /// <param name="path">Path to posthitory.xml</param>
  54.         /// <param name="pathToResults">Path to results output</param>
  55.         public static void FromPostHistory(string path, string pathToResults)
  56.         {
  57.             //Dictionary<int, bool> thedata = new Dictionary<int, bool>();
  58.             using (StreamReader resultsreader = new StreamReader(pathToResults))
  59.             {
  60.                 Regex postId = new Regex("(?<=Id=\")\\d+");
  61.                 Regex postDate = new Regex("(?<=CreationDate=\")-?\\d+[-]\\d+");
  62.                 //postTitle and postBody by Indexof()
  63.                 Regex postScore = new Regex("(?<=Score=\")-?\\d+");
  64.                 while (resultsreader.Peek() >= 0)
  65.                 {
  66.                     string a = resultsreader.ReadLine();
  67.                     if (!string.IsNullOrEmpty(a))
  68.                     {
  69.                         if (a.Contains("ClosedDate") && !a.Contains("PostTypeId=\"2\""))
  70.                         {
  71.                             int b = int.Parse(postId.Match(a).Value);
  72.                             string c = a.Substring(a.IndexOf("Title=\"") + 7,
  73.                                                    a.IndexOf("Tags=\"") - (a.IndexOf("Title=\"") + 9));
  74.                             string d = a.Substring(a.IndexOf("Body=\"") + 6,
  75.                                 (a.IndexOf("OwnerUserId=\"") == -1 ? a.IndexOf("LastActivityDate=\"") : a.IndexOf("OwnerUserId=\"")) - a.IndexOf("Body=\""));
  76.                             int e = int.Parse(postScore.Match(a).Value);
  77.                             int f = int.Parse(postDate.Match(a).Value.Remove(4, 1));
  78.                             _postData.Add(b, new SUPostData(c, d, e, f));
  79.                         }
  80.                     }
  81.                 }
  82.             }
  83.             using (StreamReader reader = new StreamReader(path))
  84.             {
  85.                 Regex postId = new Regex("(?<=PostId=\")\\d+");
  86.                 Regex postType = new Regex("(?<=PostHistoryTypeId=\")\\d+");
  87.                 while (reader.Peek() >= 0)
  88.                 {
  89.                     string a = reader.ReadLine();
  90.                     if (!string.IsNullOrEmpty(a))
  91.                     {
  92.                         string historyTypeId = postType.Match(a).Value;
  93.                         if (postId.Match(a).Value != "")
  94.                         {
  95.                             int id = int.Parse(postId.Match(a).Value);
  96.                             switch (historyTypeId)
  97.                             {
  98.                                 case "10":
  99.                                     if (!a.Contains("CloseReasonId=\"1\""))
  100.                                         AddSUHistory(a, true, ref _postHistoryData);
  101.                                     break;
  102.                                 case "11":
  103.                                     AddSUHistory(a, false, ref _postHistoryData);
  104.                                     break;
  105.                                 case "12":
  106.                                     _postHistoryData.Remove(id);
  107.                                     break;
  108.                                 case "13":
  109.                                     AddSUHistory(a, true, ref _postHistoryData);
  110.                                     break;
  111.                                 case "17":
  112.                                     _postHistoryData.Remove(id);
  113.                                     break;
  114.                                 case "22":
  115.                                     AddSUHistory(a, false, ref _postHistoryData);
  116.                                     break;
  117.                             }
  118.                         }
  119.                     }
  120.                 }
  121.             }
  122.             //Cleaner(_postHistoryData);
  123.             using (StreamWriter writer = new StreamWriter(@"C:\Users\Daniel.HARUMPH\Downloads\Stack Overflow Data Dump - Jun 2011\Content\062011 Super User\result2.csv"))
  124.             {
  125.                 writer.WriteLine("\"ID\",\"Score\",\"MetaScore\",\"Date\",\"KeywordScore\"");
  126.                 foreach (KeyValuePair<int, SUHistoryContainer> keypair in _postHistoryData)
  127.                 {
  128.                     if (keypair.Value.Closed)
  129.                     {
  130.                         writer.WriteLine("\"" + keypair.Key + "\",\"" + keypair.Value.Score + "\",\"" + keypair.Value.Metascore + "\",\"" + keypair.Value.YearMonth + "\",\"" + keypair.Value.KeywordScore + "\"");
  131.                     }
  132.                 }
  133.             }
  134.         }
  135.         /// <summary>
  136.         /// Use this to add a keypair to the collection.
  137.         /// </summary>
  138.         /// <param name="theline"> The line that contains the data you want added</param>
  139.         /// <param name="isclosed"> Is the question currently closed (as opposed to deleted, re-opened, migrated, merged, etc</param>
  140.         /// <param name="theref">The collection of keypairs that will be added to</param>
  141.         public static void AddSUHistory(string theline, bool isclosed, ref Dictionary<int, SUHistoryContainer> theref)
  142.         {
  143.             Regex postId = new Regex("(?<=PostId=\")\\d+");
  144.             Regex postDate = new Regex("(?<=CreationDate=\")-?\\d+[-]\\d+");
  145.             int id = int.Parse(postId.Match(theline).Value);
  146.             int date = int.Parse(postDate.Match(theline).Value.Remove(4, 1));
  147.             if (!_postData.ContainsKey(id))
  148.             {
  149.                 if (_postHistoryData.ContainsKey(id))
  150.                 {
  151.                     SUHistoryContainer temp = theref[id];
  152.                     temp.Closed = isclosed;
  153.                     theref[id] = temp;
  154.                 }
  155.                 else
  156.                 {
  157.                     theref.Add(id, new SUHistoryContainer(0, date, 0, isclosed));
  158.                 }
  159.             }
  160.             else
  161.             {
  162.                 if (theref.ContainsKey(id))
  163.                 {
  164.                     SUHistoryContainer temp = theref[id];
  165.                     temp.Closed = isclosed;
  166.                     theref[id] = temp;
  167.                 }
  168.                 else
  169.                 {
  170.                     theref.Add(id,
  171.                         new SUHistoryContainer(_postData[id].Score, _postData[id].CreatedDate, KeywordScore(_postData[id].Title, _postData[id].Body),
  172.                             isclosed));
  173.                 }
  174.             }
  175.         }
  176.         /// <summary>
  177.         /// Score the post by its content. As of now, it is a crude fuction that just detects if certain words are there.
  178.         /// </summary>
  179.         /// <param name="title">The post title</param>
  180.         /// <param name="body">The post body</param>
  181.         /// <returns>A score that reprents the likelyhood it is garbage (lower = more likely)</returns>
  182.         public static float KeywordScore(string title, string body)
  183.         {
  184.             Dictionary<string, float> criterion = new Dictionary<string, float> { { "mobile", 2 }, { "cell phone", 2 }, { "Iphone", 2 }, { "buy", 2 }, { "sell", 2 }, { "versus", .5f },
  185.             { "best", .5f }, { "which is better", 2 }, { "what is the best", 2 }, { "cheap", 1 }, { "value", 1 }, { "worth it", 1 }, {"ipad",2},{"tablet",2},{"touchscreen",.5f},
  186.             {"facebook",2},{"website",.5f},{"compared to",.5f},{"old",1},{"can be used for",1},{"shopping",3},{"funny",1},{"new",1}};
  187.             float runningTotal = 0;
  188.             foreach (KeyValuePair<string, float> pair in criterion)
  189.                 runningTotal += (title.ToUpper().Contains(pair.Key.ToUpper()) ? pair.Value * 2 : 0) + (body.ToUpper().Contains(pair.Key.ToUpper()) ? pair.Value : 0);
  190.             return runningTotal * -1;
  191.         }
  192.         /// <summary>
  193.         /// A helper function that is designed to check if the post was been removed since the last data dump. Called by Cleaner.
  194.         /// </summary>
  195.         /// <param name="url">URL to check</param>
  196.         /// <returns>True if page exists, else false</returns>
  197.         public static bool DoesPageExist(string url)
  198.         {
  199.             HttpWebRequest webRequestObject = (HttpWebRequest)WebRequest.Create(url);
  200.             webRequestObject.UserAgent = ".NET Framework/4.0";
  201.             //If the object can be created, the page exists. Else it doesn't.
  202.             try
  203.             {
  204.                 WebResponse response = webRequestObject.GetResponse();
  205.             }
  206.             catch (Exception ex)
  207.             {
  208.                 if (ex.Message.Contains("404"))
  209.                 {
  210.                     return false;
  211.                 }
  212.             }
  213.             return true;
  214.         }
  215.         /// <summary>
  216.         /// Removes those items in the collections that have already been removed from SU. Not effective as there appears to be a limit to the number of requests that one can make in a given time. Calls DoesPageExist.
  217.         /// </summary>
  218.         /// <param name="suHistoryContainers"></param>
  219.         public static void Cleaner(Dictionary<int, SUHistoryContainer> suHistoryContainers)
  220.         {
  221.             //Its not good practice, but I got lazy.
  222.             Dictionary<int, bool> temp = new Dictionary<int, bool>();
  223.             foreach (KeyValuePair<int, SUHistoryContainer> suHistoryContainer in suHistoryContainers)
  224.             {
  225.                 temp.Add(suHistoryContainer.Key,
  226.                          DoesPageExist("http://superuser.com/questions/" + suHistoryContainer.Key));
  227.             }
  228.             foreach (KeyValuePair<int, bool> keyValuePair in temp)
  229.             {
  230.                 if (!keyValuePair.Value)
  231.                 {
  232.                     _postHistoryData.Remove(keyValuePair.Key);
  233.                 }
  234.             }
  235.         }
  236.     }
  237.     /// <summary>
  238.     /// The container for data coming out of the posthistry.xml file.
  239.     /// </summary>
  240.     public struct SUHistoryContainer
  241.     {
  242.         public int Score;
  243.         public int YearMonth;
  244.         public float KeywordScore;
  245.         public bool Isclosed;
  246.         public bool Closed
  247.         {
  248.             get { return Isclosed; }
  249.             set { Isclosed = value; }
  250.         }
  251.         public float Metascore
  252.         {
  253.             get
  254.             {
  255.                 //Arbitrary.
  256.                 return Score > 10 ? -1 * Score / 3 : Score - (201109 - YearMonth) / 100 + KeywordScore;
  257.             }
  258.         }
  259.         /// <summary>
  260.         /// Constructor for SUHistory
  261.         /// </summary>
  262.         /// <param name="score">Post score</param>
  263.         /// <param name="yearMonth">Post created date (yyyymm)</param>
  264.         /// <param name="isClosed">Is it closed? (true==yes, false == no</param>
  265.         public SUHistoryContainer(int score, int yearMonth, bool isClosed)
  266.             : this(score, yearMonth, 0, isClosed)
  267.         {
  268.  
  269.         }
  270.         /// <summary>
  271.         /// Constructor for SUHistory
  272.         /// </summary>
  273.         /// <param name="score">Post score</param>
  274.         /// <param name="yearMonth">Post created date (yyyymm)</param>
  275.         /// <param name="keywordScore">It's keyword score</param>
  276.         /// <param name="isClosed">Is it closed? (true==yes, false == no</param>
  277.         public SUHistoryContainer(int score, int yearMonth, float keywordScore, bool isClosed)
  278.         {
  279.             Score = score;
  280.             YearMonth = yearMonth;
  281.             KeywordScore = keywordScore;
  282.             Isclosed = isClosed;
  283.         }
  284.     }
  285.     /// <summary>
  286.     /// The container for data coming out of the posts.xml file.
  287.     /// </summary>
  288.     public struct SUPostData
  289.     {
  290.         public string Title;
  291.         public string Body;
  292.         public int Score;
  293.         public int CreatedDate;
  294.         /// <summary>
  295.         /// Constructor of SUpostData
  296.         /// </summary>
  297.         /// <param name="title">Post title</param>
  298.         /// <param name="body">Post body</param>
  299.         /// <param name="score">Post score</param>
  300.         /// <param name="createdDate">Post creation date</param>
  301.         public SUPostData(string title, string body, int score, int createdDate)
  302.         {
  303.             Title = title;
  304.             Body = body;
  305.             Score = score;
  306.             CreatedDate = createdDate;
  307.         }
  308.     }
  309. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement