Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#' Download one feed page and parse its posts into a data frame.
#'
#' Relies on `getURL()`, `htmlParse()`, `xpathSApply()`, `xmlValue` and
#' `xmlGetAttr` being in scope (presumably from the RCurl and XML packages --
#' confirm they are attached before calling).
#'
#' @param url Address of the page to fetch.
#' @param cookie Cookie value forwarded to `getURL()`.
#' @param year Year written into the parsed publication dates; the month
#'   substitution below supplies the year itself. Defaults to 2012.
#' @return A data frame with one row per post and the columns, in order:
#'   hrefs, published, scoredetailes, scores, pageviews, favs, comments,
#'   actions, plusactions, minusactions, publishedDate, weekDay.
#'   The three vote-count columns are always present; a row whose score
#'   tooltip does not split into the expected four pieces gets `NA` there.
getPosts <- function(url, cookie, year = 2012) {
  cat("Retriving data for: ", url, "\n")
  html <- getURL(url, cookie = cookie)
  doc <- htmlParse(html)

  published <- xpathSApply(doc, "//div[@class='published']", xmlValue)
  pageviews <- xpathSApply(doc, "//div[@class='pageviews']", xmlValue)
  favs <- xpathSApply(doc, "//div[@class='favs_count']", xmlValue)
  scoredetailes <- xpathSApply(
    doc, "//span[@class='score']", xmlGetAttr, "title"
  )
  scores <- xpathSApply(doc, "//span[@class='score']", xmlValue)
  comments <- xpathSApply(doc, "//span[@class='all']", xmlValue)
  hrefs <- xpathSApply(doc, "//a[@class='post_title']", xmlGetAttr, "href")

  posts <- data.frame(
    hrefs, published, scoredetailes, scores, pageviews, favs, comments
  )

  # as.character() first so factor columns convert by label, not level code.
  to_number <- function(x) as.numeric(as.character(x))

  # Splitting the tooltip on runs of non-digits is expected to give four
  # pieces: a leading empty string followed by three numbers. Parsing is done
  # per row so one odd tooltip cannot drop the columns for the whole page,
  # which would make the pages impossible to rbind() together.
  score_parts <- strsplit(
    as.character(posts$scoredetailes), "\\D+",
    perl = TRUE
  )
  pick_part <- function(idx) {
    vapply(
      score_parts,
      function(p) if (length(p) == 4) as.numeric(p[idx]) else NA_real_,
      numeric(1)
    )
  }
  posts$actions <- pick_part(2)
  posts$plusactions <- pick_part(3)
  posts$minusactions <- pick_part(4)

  posts$comments <- to_number(posts$comments)
  posts$scores <- to_number(posts$scores)
  posts$favs <- to_number(posts$favs)
  posts$pageviews <- to_number(posts$pageviews)

  # Rewrite " <month name> в " as "/<mm>/<year> " so as.Date() can parse it.
  month_names <- c(
    "января", "февраля", "марта", "апреля", "мая", "июня",
    "июля", "августа", "сентября", "октября", "ноября", "декабря"
  )
  published_text <- as.character(posts$published)
  for (i in seq_along(month_names)) {
    published_text <- sub(
      paste0(" ", month_names[i], " в "),
      sprintf("/%02d/%s ", i, year),
      published_text
    )
  }
  posts$published <- sub("^ ", "", published_text)

  # as.Date() keeps only the date part; the time stays in `published`.
  posts$publishedDate <- as.Date(posts$published, format = "%d/%m/%Y %H:%M")
  # Weekday names follow the current locale.
  posts$weekDay <- format(posts$publishedDate, "%A")
  posts
}
#' Fetch several feed pages and stack their posts into one data frame.
#'
#' Calls `getPosts()` once per page, in the order given, pausing after each
#' request.
#'
#' @param pages Page numbers to insert into the feed URL.
#' @param cookie Cookie value passed through to `getPosts()`.
#' @param sleep Seconds to wait after each request (default 0).
#' @return The row-bound results of `getPosts()` for every page; an empty
#'   data frame when `pages` has length zero.
getPostsForPages <- function(pages, cookie, sleep = 0) {
  # With no pages the URL template would still collapse to a single bogus
  # ".../page/" address, so stop before building it.
  if (length(pages) == 0) {
    return(data.frame())
  }

  urls <- paste0("http://habrahabr.ru/feed/posts/habred/page", pages, "/")

  # Collect every page first and bind once, instead of re-copying the
  # accumulated frame on each iteration.
  page_frames <- lapply(urls, function(url) {
    page_posts <- getPosts(url, cookie)
    Sys.sleep(sleep)
    page_posts
  })
  do.call(rbind, page_frames)
}
#' Draw three overview plots for a posts data frame.
#'
#' Columns are addressed by position; in the frame built by `getPosts()`
#' columns 8:10 are actions / plusactions / minusactions and 9:11 are
#' plusactions / minusactions / publishedDate.
#'
#' @param posts Data frame with at least the columns `actions`, `comments`
#'   and `weekDay`, and at least 11 columns in total.
#' @return The value of the final `hist()` call, invisibly.
showHabraHabrStats <- function(posts) {
  plot(posts[, 8:10])

  # Weekday names are not colour names, so map each distinct weekday to an
  # integer colour code. which() drops rows whose `actions` is NA, and the
  # colour vector is subset with the same rows so it stays aligned.
  day_codes <- as.integer(factor(posts$weekDay))
  low_activity <- which(posts$actions < 100)
  plot(posts[low_activity, 9:11], col = day_codes[low_activity])

  hist(posts[posts$comments < 200, "comments"], breaks = 60, col = rainbow(60))
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement