Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
- namespace superuser
- {
- static class SUAnalysis
- {
// Close/reopen state per question id; filled through AddSUHistory and pruned by FromPostHistory and Cleaner.
private static Dictionary<int, SUHistoryContainer> _postHistoryData = new Dictionary<int, SUHistoryContainer>();

// Title, body, score and creation month per question id; filled by FromPostHistory and read by AddSUHistory.
private static Dictionary<int, SUPostData> _postData = new Dictionary<int, SUPostData>();
/// <summary>
/// Extracts the closed, non-answer rows from posts.xml, orders them by ascending Score and writes them,
/// one row per line, to <paramref name="outputPath"/>. Prerequisite for FromPostHistory.
/// </summary>
/// <param name="path">Path to posts.xml</param>
/// <param name="outputPath">Path to output file (may be the same file as <paramref name="path"/>)</param>
public static void FromPosts(string path, string outputPath)
{
    Regex scorePattern = new Regex("(?<=Score=\")-?\\d+");
    List<KeyValuePair<int, string>> closedRows = new List<KeyValuePair<int, string>>();

    using (StreamReader reader = new StreamReader(path))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // Keep only rows that mention ClosedDate and are not answers (PostTypeId="2").
            if (!line.Contains("ClosedDate") || line.Contains("PostTypeId=\"2\""))
            {
                continue;
            }

            // A row without a parseable Score sorts as 0 instead of aborting the whole run.
            int score;
            int.TryParse(scorePattern.Match(line).Value, NumberStyles.Integer, CultureInfo.InvariantCulture, out score);
            closedRows.Add(new KeyValuePair<int, string>(score, line));
        }
    }

    // The input is closed before the output is opened, so reading and writing never hold
    // file handles at the same time.
    using (StreamWriter writer = new StreamWriter(outputPath))
    {
        // OrderBy is a stable sort: rows with equal scores keep their file order.
        foreach (KeyValuePair<int, string> row in closedRows.OrderBy(r => r.Key))
        {
            writer.WriteLine(row.Value);
        }
    }
}
/// <summary>
/// CSV destination used by the two-argument FromPostHistory overload (the location that used to be hard-coded).
/// </summary>
private const string DefaultHistoryCsvPath = @"C:\Users\Daniel.HARUMPH\Downloads\Stack Overflow Data Dump - Jun 2011\Content\062011 Super User\result2.csv";

/// <summary>
/// Gets the data from the posthistory.xml file and writes a CSV summary of the questions that end up closed
/// to the default location. Requires FromPosts to have been run beforehand.
/// </summary>
/// <param name="path">Path to posthistory.xml</param>
/// <param name="pathToResults">Path to the file written by FromPosts (it is read here, not written)</param>
public static void FromPostHistory(string path, string pathToResults)
{
    FromPostHistory(path, pathToResults, DefaultHistoryCsvPath);
}

/// <summary>
/// Gets the data from the posthistory.xml file and writes a CSV summary of the questions that end up closed.
/// Requires FromPosts to have been run beforehand.
/// </summary>
/// <param name="path">Path to posthistory.xml</param>
/// <param name="pathToResults">Path to the file written by FromPosts (it is read here, not written)</param>
/// <param name="csvOutputPath">Path of the CSV file to create</param>
public static void FromPostHistory(string path, string pathToResults, string csvOutputPath)
{
    LoadClosedPostData(pathToResults);
    ReplayPostHistory(path);
    WriteClosedQuestionCsv(csvOutputPath);
}

/// <summary>
/// Fills _postData with title, body, score and creation month of every closed question row in the FromPosts output.
/// </summary>
private static void LoadClosedPostData(string closedPostsPath)
{
    Regex idPattern = new Regex("(?<=Id=\")\\d+");
    Regex creationDatePattern = new Regex("(?<=CreationDate=\")-?\\d+[-]\\d+");
    Regex scorePattern = new Regex("(?<=Score=\")-?\\d+");

    using (StreamReader reader = new StreamReader(closedPostsPath))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (!line.Contains("ClosedDate") || line.Contains("PostTypeId=\"2\""))
            {
                continue;
            }

            // "yyyy-MM" -> "yyyyMM": the dash at index 4 is dropped before parsing.
            string rawDate = creationDatePattern.Match(line).Value;
            int id;
            int score;
            int yearMonth;
            if (rawDate.Length <= 4
                || !TryParseInvariantInt(idPattern.Match(line).Value, out id)
                || !TryParseInvariantInt(scorePattern.Match(line).Value, out score)
                || !TryParseInvariantInt(rawDate.Remove(4, 1), out yearMonth))
            {
                // Rows missing Id, Score or CreationDate are skipped rather than aborting the run.
                continue;
            }

            // Indexer instead of Add: the dictionary is static, so a second run must not throw on known ids.
            _postData[id] = new SUPostData(ReadAttributeValue(line, "Title"), ReadAttributeValue(line, "Body"), score, yearMonth);
        }
    }
}

/// <summary>
/// Applies each posthistory.xml event to _postHistoryData in file order.
/// </summary>
private static void ReplayPostHistory(string historyPath)
{
    Regex postIdPattern = new Regex("(?<=PostId=\")\\d+");
    Regex historyTypePattern = new Regex("(?<=PostHistoryTypeId=\")\\d+");

    using (StreamReader reader = new StreamReader(historyPath))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            Match idMatch = postIdPattern.Match(line);
            if (!idMatch.Success)
            {
                continue;
            }

            int id = int.Parse(idMatch.Value, CultureInfo.InvariantCulture);

            // NOTE(review): the meanings below follow the Stack Exchange data-dump schema
            // (10 closed, 11 reopened, 12 deleted, 13 undeleted, 17 migrated, 22 unmerged) - confirm against the dump's readme.
            switch (historyTypePattern.Match(line).Value)
            {
                case "10":
                    // CloseReasonId="1" closes are deliberately not counted (presumably duplicates - confirm).
                    if (!line.Contains("CloseReasonId=\"1\""))
                    {
                        AddSUHistory(line, true, ref _postHistoryData);
                    }
                    break;
                case "13":
                    AddSUHistory(line, true, ref _postHistoryData);
                    break;
                case "11":
                case "22":
                    AddSUHistory(line, false, ref _postHistoryData);
                    break;
                case "12":
                case "17":
                    _postHistoryData.Remove(id);
                    break;
            }
        }
    }
}

/// <summary>
/// Writes one quoted CSV row per entry of _postHistoryData that is currently flagged as closed.
/// </summary>
private static void WriteClosedQuestionCsv(string csvPath)
{
    using (StreamWriter writer = new StreamWriter(csvPath))
    {
        writer.WriteLine("\"ID\",\"Score\",\"MetaScore\",\"Date\",\"KeywordScore\"");
        foreach (KeyValuePair<int, SUHistoryContainer> entry in _postHistoryData)
        {
            if (!entry.Value.Closed)
            {
                continue;
            }

            // Invariant culture keeps '.' as the decimal separator so the float columns cannot break the CSV.
            writer.WriteLine(string.Format(
                CultureInfo.InvariantCulture,
                "\"{0}\",\"{1}\",\"{2}\",\"{3}\",\"{4}\"",
                entry.Key,
                entry.Value.Score,
                entry.Value.Metascore,
                entry.Value.YearMonth,
                entry.Value.KeywordScore));
        }
    }
}

/// <summary>
/// Returns the raw text between the quotes of attributeName="..." in an XML row, or an empty string when the
/// attribute is absent. Relies on quotes inside attribute values being escaped (&amp;quot;) in the XML.
/// </summary>
private static string ReadAttributeValue(string line, string attributeName)
{
    // The leading space stops "Title" from matching the tail of a longer attribute name.
    string marker = " " + attributeName + "=\"";
    int start = line.IndexOf(marker, StringComparison.Ordinal);
    if (start < 0)
    {
        return string.Empty;
    }

    start += marker.Length;
    int end = line.IndexOf('"', start);
    return end < 0 ? line.Substring(start) : line.Substring(start, end - start);
}

/// <summary>
/// Culture-independent integer parse; returns false for empty or malformed text.
/// </summary>
private static bool TryParseInvariantInt(string text, out int value)
{
    return int.TryParse(text, NumberStyles.Integer, CultureInfo.InvariantCulture, out value);
}
// Compiled once: AddSUHistory runs for every matching row of posthistory.xml.
private static readonly Regex HistoryPostIdPattern = new Regex("(?<=PostId=\")\\d+");
private static readonly Regex HistoryCreationDatePattern = new Regex("(?<=CreationDate=\")-?\\d+[-]\\d+");

/// <summary>
/// Use this to add a keypair to the collection, or to update the closed flag of a post that is already in it.
/// </summary>
/// <param name="theline">The posthistory.xml row that contains the data you want added</param>
/// <param name="isclosed">Is the question currently closed (as opposed to deleted, re-opened, migrated, merged, etc.)</param>
/// <param name="theref">The collection of keypairs that will be added to</param>
public static void AddSUHistory(string theline, bool isclosed, ref Dictionary<int, SUHistoryContainer> theref)
{
    int id = int.Parse(HistoryPostIdPattern.Match(theline).Value, CultureInfo.InvariantCulture);

    // Known post: only the closed flag changes. The lookup always goes through theref, so the
    // existence check and the update can never disagree about which dictionary is meant.
    SUHistoryContainer existing;
    if (theref.TryGetValue(id, out existing))
    {
        existing.Closed = isclosed;
        theref[id] = existing;
        return;
    }

    SUPostData post;
    if (_postData.TryGetValue(id, out post))
    {
        // Post was seen in the FromPosts output: use its real score, creation month and keyword score.
        theref.Add(id, new SUHistoryContainer(post.Score, post.CreatedDate, KeywordScore(post.Title, post.Body), isclosed));
        return;
    }

    // Unknown post: fall back to score 0 and the month of this history row.
    // "yyyy-MM" -> "yyyyMM": the dash at index 4 is dropped before parsing (throws if the row has no CreationDate).
    int date = int.Parse(HistoryCreationDatePattern.Match(theline).Value.Remove(4, 1), CultureInfo.InvariantCulture);
    theref.Add(id, new SUHistoryContainer(0, date, 0, isclosed));
}
// Keyword -> weight table, built once instead of on every call.
private static readonly Dictionary<string, float> KeywordPenalties = new Dictionary<string, float>
{
    { "mobile", 2 }, { "cell phone", 2 }, { "Iphone", 2 }, { "buy", 2 }, { "sell", 2 }, { "versus", .5f },
    { "best", .5f }, { "which is better", 2 }, { "what is the best", 2 }, { "cheap", 1 }, { "value", 1 }, { "worth it", 1 },
    { "ipad", 2 }, { "tablet", 2 }, { "touchscreen", .5f }, { "facebook", 2 }, { "website", .5f }, { "compared to", .5f },
    { "old", 1 }, { "can be used for", 1 }, { "shopping", 3 }, { "funny", 1 }, { "new", 1 }
};

/// <summary>
/// Score the post by its content. As of now, it is a crude function that just detects if certain words are there.
/// A keyword found in the title counts twice its weight, a keyword found in the body counts once.
/// </summary>
/// <param name="title">The post title (null is treated as empty)</param>
/// <param name="body">The post body (null is treated as empty)</param>
/// <returns>A score that represents the likelihood it is garbage (lower = more likely); never positive</returns>
public static float KeywordScore(string title, string body)
{
    title = title ?? string.Empty;
    body = body ?? string.Empty;

    float runningTotal = 0;
    foreach (KeyValuePair<string, float> pair in KeywordPenalties)
    {
        // Ordinal, case-insensitive search: no per-keyword ToUpper() copies of the text and no
        // dependence on the current culture's casing rules.
        if (title.IndexOf(pair.Key, StringComparison.OrdinalIgnoreCase) >= 0)
        {
            runningTotal += pair.Value * 2;
        }

        if (body.IndexOf(pair.Key, StringComparison.OrdinalIgnoreCase) >= 0)
        {
            runningTotal += pair.Value;
        }
    }

    return runningTotal * -1;
}
/// <summary>
/// A helper function that is designed to check if the post has been removed since the last data dump. Called by Cleaner.
/// </summary>
/// <param name="url">URL to check</param>
/// <returns>False only when the server answers 404 Not Found; true for a successful response and for any other web failure</returns>
public static bool DoesPageExist(string url)
{
    HttpWebRequest webRequestObject = (HttpWebRequest)WebRequest.Create(url);
    webRequestObject.UserAgent = ".NET Framework/4.0";

    try
    {
        // Disposing the response releases the underlying connection; leaving responses open
        // exhausts the per-host connection pool after a few requests.
        using (webRequestObject.GetResponse())
        {
            return true;
        }
    }
    catch (WebException ex)
    {
        // Inspect the HTTP status code instead of searching the (localizable) exception message for "404".
        // A using statement tolerates a null resource, which happens for timeouts and DNS failures.
        using (HttpWebResponse errorResponse = ex.Response as HttpWebResponse)
        {
            return errorResponse == null || errorResponse.StatusCode != HttpStatusCode.NotFound;
        }
    }
}
/// <summary>
/// Removes those items in the collection that have already been removed from SU. Not effective as there appears to be
/// a limit to the number of requests that one can make in a given time. Calls DoesPageExist once per entry.
/// </summary>
/// <param name="suHistoryContainers">The collection to prune; entries whose question page returns 404 are removed from it</param>
public static void Cleaner(Dictionary<int, SUHistoryContainer> suHistoryContainers)
{
    // A dictionary cannot be modified while it is being enumerated, so the ids to drop are collected first.
    List<int> missingIds = new List<int>();
    foreach (int id in suHistoryContainers.Keys)
    {
        if (!DoesPageExist("http://superuser.com/questions/" + id))
        {
            missingIds.Add(id);
        }
    }

    // Remove from the dictionary that was passed in (not from the static field), so the method
    // prunes exactly the collection it inspected.
    foreach (int id in missingIds)
    {
        suHistoryContainers.Remove(id);
    }
}
- }
/// <summary>
/// Per-question record assembled while reading posthistory.xml: score, creation month,
/// keyword score and whether the question is currently considered closed.
/// </summary>
public struct SUHistoryContainer
{
    public int Score;
    public int YearMonth;
    public float KeywordScore;
    public bool Isclosed;

    /// <summary>
    /// Creates a record with an explicit keyword score.
    /// </summary>
    /// <param name="score">Post score</param>
    /// <param name="yearMonth">Post created date (yyyymm)</param>
    /// <param name="keywordScore">Its keyword score</param>
    /// <param name="isClosed">Is it closed? (true == yes, false == no)</param>
    public SUHistoryContainer(int score, int yearMonth, float keywordScore, bool isClosed)
    {
        this.Score = score;
        this.YearMonth = yearMonth;
        this.KeywordScore = keywordScore;
        this.Isclosed = isClosed;
    }

    /// <summary>
    /// Creates a record whose keyword score is zero.
    /// </summary>
    /// <param name="score">Post score</param>
    /// <param name="yearMonth">Post created date (yyyymm)</param>
    /// <param name="isClosed">Is it closed? (true == yes, false == no)</param>
    public SUHistoryContainer(int score, int yearMonth, bool isClosed)
    {
        this.Score = score;
        this.YearMonth = yearMonth;
        this.KeywordScore = 0;
        this.Isclosed = isClosed;
    }

    /// <summary>
    /// Property view over the Isclosed field.
    /// </summary>
    public bool Closed
    {
        get { return this.Isclosed; }
        set { this.Isclosed = value; }
    }

    /// <summary>
    /// Combined ranking value. The formula is arbitrary (author's words); both divisions are integer divisions.
    /// </summary>
    public float Metascore
    {
        get
        {
            if (this.Score > 10)
            {
                // Scores above 10 map to minus a third of the score, truncated toward zero.
                int invertedThird = -1 * this.Score / 3;
                return invertedThird;
            }

            // Whole hundreds of yyyymm distance from 201109 (presumably the reference month - confirm).
            int ageTerm = (201109 - this.YearMonth) / 100;
            return this.Score - ageTerm + this.KeywordScore;
        }
    }
}
/// <summary>
/// Plain record for one row of posts.xml: the title, body, score and creation date of a post.
/// </summary>
public struct SUPostData
{
    public string Title;
    public string Body;
    public int Score;
    public int CreatedDate;

    /// <summary>
    /// Stores the four values unchanged in the corresponding fields.
    /// </summary>
    /// <param name="title">Post title</param>
    /// <param name="body">Post body</param>
    /// <param name="score">Post score</param>
    /// <param name="createdDate">Post creation date</param>
    public SUPostData(string title, string body, int score, int createdDate)
    {
        this.CreatedDate = createdDate;
        this.Score = score;
        this.Body = body;
        this.Title = title;
    }
}
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement