Advertisement
Guest User

habrahabr

a guest
Dec 13th, 2012
2,647
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 2.31 KB | None | 0 0
  #' Download one feed page and turn the posts found on it into a data frame.
  #'
  #' The page is fetched with `getURL()`, parsed with `htmlParse()` and queried
  #' with `xpathSApply()`; those functions must already be attached by the
  #' calling script (they are not loaded here).
  #'
  #' @param url Address of the page to fetch.
  #' @param cookie Cookie string handed to `getURL()`.
  #' @param year Year inserted into the publication dates. The date patterns
  #'   matched below ("<day> <month> в <time>") carry no year, so it has to be
  #'   supplied; the default keeps the previously hard-coded 2012.
  #' @return A data frame with one row per post and the columns `hrefs`,
  #'   `published`, `scoredetailes`, `scores`, `pageviews`, `favs`, `comments`,
  #'   `actions`, `plusactions`, `minusactions`, `publishedDate` and `weekDay`
  #'   (in that order). The three vote columns are always present and hold
  #'   `NA` for posts whose score tooltip has no numbers.
  getPosts <- function(url, cookie, year = 2012) {
    cat("Retriving data for: ", url,"\n")
    html <- getURL(url, cookie = cookie)
    doc <- htmlParse(html)

    # xpathSApply() gives back a list when nothing matches; flatten so that an
    # empty page yields zero-length character vectors.
    as_text <- function(x) as.character(unlist(x))

    fields <- list(
      hrefs = as_text(
        xpathSApply(doc, "//a[@class='post_title']", xmlGetAttr, "href")
      ),
      published = as_text(
        xpathSApply(doc, "//div[@class='published']", xmlValue)
      ),
      # default = NA keeps one entry per post even if a score has no tooltip;
      # without it the missing attribute comes back as NULL.
      scoredetailes = as_text(
        xpathSApply(doc, "//span[@class='score']", xmlGetAttr, "title",
                    default = NA_character_)
      ),
      scores = as_text(xpathSApply(doc, "//span[@class='score']", xmlValue)),
      pageviews = as_text(
        xpathSApply(doc, "//div[@class='pageviews']", xmlValue)
      ),
      favs = as_text(xpathSApply(doc, "//div[@class='favs_count']", xmlValue)),
      comments = as_text(xpathSApply(doc, "//span[@class='all']", xmlValue))
    )

    found <- vapply(fields, length, integer(1))
    if (length(unique(found)) > 1L) {
      stop(
        "Page fields have differing lengths (",
        paste(names(found), found, sep = "=", collapse = ", "),
        ") for ", url,
        call. = FALSE
      )
    }
    posts <- as.data.frame(fields, stringsAsFactors = FALSE)

    # The score tooltip holds three numbers: total votes, up-votes, down-votes.
    # Each tooltip is parsed on its own and padded with NA, so a single odd or
    # missing tooltip no longer makes the vote columns disappear.
    first_three_numbers <- function(detail) {
      if (is.na(detail)) {
        return(rep(NA_real_, 3L))
      }
      digits <- regmatches(detail, gregexpr("[0-9]+", detail))[[1L]]
      as.numeric(digits[1:3])
    }
    votes <- vapply(
      posts$scoredetailes, first_three_numbers, numeric(3), USE.NAMES = FALSE
    )
    posts$actions <- votes[1L, ]
    posts$plusactions <- votes[2L, ]
    posts$minusactions <- votes[3L, ]

    for (column in c("comments", "scores", "favs", "pageviews")) {
      posts[[column]] <- as.numeric(posts[[column]])
    }

    # Rewrite "<day> <month name> в <time>" as "<day>/<mm>/<year> <time>".
    month_numbers <- c(
      "января" = "01", "февраля" = "02", "марта" = "03", "апреля" = "04",
      "мая" = "05", "июня" = "06", "июля" = "07", "августа" = "08",
      "сентября" = "09", "октября" = "10", "ноября" = "11", "декабря" = "12"
    )
    published <- posts$published
    for (month_name in names(month_numbers)) {
      published <- sub(
        paste0(" ", month_name, " в "),
        paste0("/", month_numbers[[month_name]], "/", year, " "),
        published,
        fixed = TRUE
      )
    }
    posts$published <- sub("^ ", "", published)

    # as.Date() keeps only the date part; entries that do not match the format
    # become NA.
    posts$publishedDate <- as.Date(posts$published, format = "%d/%m/%Y %H:%M")
    # Weekday name as produced by format(); its language follows the locale.
    posts$weekDay <- format(posts$publishedDate, "%A")
    posts
  }
  36.  
  #' Fetch several feed pages and stack their posts into one data frame.
  #'
  #' @param pages Page numbers; each one is spliced into
  #'   "http://habrahabr.ru/feed/posts/habred/page<N>/".
  #' @param cookie Cookie string handed on to `getPosts()`.
  #' @param sleep Seconds to pause (via `Sys.sleep()`) after each page.
  #' @param ... Further arguments forwarded unchanged to `getPosts()`.
  #' @return The row-bound results of `getPosts()` for every page, in the order
  #'   of `pages`; an empty data frame when `pages` has length zero.
  getPostsForPages <- function(pages, cookie, sleep = 0, ...) {
    # paste0() treats a zero-length vector as "", which would produce the single
    # bogus URL ".../page/" and trigger a request for it, so bail out early.
    if (length(pages) == 0L) {
      return(data.frame())
    }

    urls <- paste0("http://habrahabr.ru/feed/posts/habred/page", pages, "/")

    # Collect the per-page frames first and bind once, instead of growing the
    # result with rbind() on every iteration.
    per_page <- lapply(urls, function(url) {
      page_posts <- getPosts(url, cookie, ...)
      Sys.sleep(sleep)
      page_posts
    })
    do.call(rbind, per_page)
  }
  48.  
  #' Draw three exploratory plots for a data frame of posts.
  #'
  #' 1. Scatter-plot matrix of `actions`, `plusactions` and `minusactions`.
  #' 2. Scatter-plot matrix of `plusactions`, `minusactions` and
  #'    `publishedDate` for posts with fewer than 100 `actions`, coloured by
  #'    `weekDay`.
  #' 3. Histogram of `comments` for posts with fewer than 200 comments.
  #'
  #' @param posts Data frame with the columns named above plus `comments` and
  #'   `weekDay` (the layout built by `getPosts()`).
  #' @return Invisibly, the object returned by `hist()`, or `NULL` when no post
  #'   has fewer than 200 comments.
  showHabraHabrStats <- function(posts) {
    # Columns are addressed by name; these are the ones that sat at positions
    # 8:10 and 9:11 of the frame.
    plot(posts[, c("actions", "plusactions", "minusactions")])

    # which() drops rows whose vote total is NA instead of turning them into
    # all-NA rows.
    low_vote_rows <- which(posts$actions < 100)
    if (length(low_vote_rows) > 0L) {
      # Weekday names are not colour names: map them to integer codes, and take
      # the codes for the plotted rows only so colours stay aligned with points.
      day_colours <- as.integer(factor(posts$weekDay))[low_vote_rows]
      plot(
        posts[low_vote_rows, c("plusactions", "minusactions", "publishedDate")],
        col = day_colours
      )
    }

    comment_counts <- posts[which(posts$comments < 200), "comments"]
    if (length(comment_counts) == 0L) {
      return(invisible(NULL))
    }
    hist(
      comment_counts,
      breaks = 60,
      col = rainbow(60),
      main = "Histogram of comments per post",
      xlab = "comments"
    )
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement