Advertisement
mjaniec

nju mobile data analysis

Dec 23rd, 2014
341
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 7.74 KB | None | 0 0
  1. # visit: http://www.reakkt.com/2014/12/when-marginal-price-of-call-goes-to-zero.html
  2.  
  3. library(MASS)
  4. library(plotrix)
  5. library(Hmisc) # http://stackoverflow.com/questions/6243088/find-out-the-number-of-days-of-a-month-in-r
  6.  
  7. ###
  8.  
  9. # http://stackoverflow.com/questions/2261079/how-to-trim-leading-and-trailing-whitespace-in-r
  10. trim <- function( x ) {
  11.   gsub("(^[[:space:]]+|[[:space:]]+$)", "", x)
  12. }
  13.  
  14. EventIdx <- function(event_name) {
  15.  
  16.   which(trim(as.character(njudata[,"event"]))==event_name)
  17.  
  18. }
  19.  
  20. TimeStrNum <- function(x) {
  21.  
  22.   as.numeric(substr(x,1,2))+as.numeric(substr(x,4,5))/60
  23.  
  24. }
  25.  
  26. CallLength <- function(x) {
  27.  
  28.   matched <- regexpr("[0-9]{1,2}:[0-9]{1,2}",x,perl=TRUE)
  29.  
  30.   s0    <- matched[1]
  31.  
  32.   s1    <- s0+attr(matched,"match.length")-1
  33.  
  34.   hhmm  <- substr(x,s0,s1)
  35.  
  36.   len   <- nchar(hhmm)
  37.  
  38.   as.numeric(substr(hhmm,1,len-3))+as.numeric(substr(hhmm,len-1,len))/60
  39.  
  40. }
  41.  
  42. ClearNumber <- function(x) {
  43.  
  44.   x <- gsub("\\+","",x)
  45.  
  46.   x <- gsub(" ","",x)
  47.  
  48.   return(x)
  49.  
  50. }
  51.  
  52. EncodeNumber <- function(x,p) {
  53.  
  54.   tmp <- NULL
  55.  
  56.   for (i in rev(1:p)) {
  57.    
  58.     d <- x %/% 26^(i-1)
  59.    
  60.     tmp <- paste(tmp,LETTERS[d],sep="")
  61.    
  62.     x <- x - d*26^(i-1)
  63.    
  64.   }
  65.  
  66.   return( tmp )
  67.  
  68.  
  69. }
  70.  
  71. DataLoad <- function(x) {
  72.  
  73.   len <- nchar(x)
  74.  
  75.   as.numeric(substr(x,1,len-4))
  76.  
  77. }
  78.  
  79. ###
  80.  
  81. njudata <- read.csv("nju mobile data.csv",header=TRUE,sep=";")
  82.  
  83. colnames(njudata) <- c("time","phone","qty","event","operator","cost")
  84.  
  85. head(njudata)
  86.  
  87. unique(njudata[,"event"])
  88.  
  89. ###
  90.  
  91. voiceIdx <- EventIdx("Rozmowa glosowa")
  92. smsIdx   <- EventIdx("Wiadomosc SMS")
  93. dataIdx  <- EventIdx("Dane")
  94.  
  95. ### voice
  96.  
  97. length(voiceIdx)
  98.  
  99. timeStr <- strptime(as.character(njudata[voiceIdx,"time"]),format="%Y.%m.%d %H:%M")
  100.  
  101. voice.days <- substr(timeStr,1,10)
  102. voice.time <- substr(timeStr,12,16)
  103.  
  104. voice.yymm <- substr(voice.days,1,7)
  105.  
  106. voice.hour <- substr(voice.time,1,2)
  107.  
  108. voice.time.numeric <- sapply(voice.time, function(x) TimeStrNum(x))
  109.  
  110. summary(voice.time.numeric)
  111.  
  112. plot(density(voice.time.numeric),xlim=c(0,24),
  113.      main="Voice Call Init. distribution",
  114.      xlab="hour of a day")
  115. abline(v=min(voice.time.numeric),col="Red")
  116. abline(v=max(voice.time.numeric),col="Red")
  117.  
  118. voice.call.length <- CallLength(as.character(njudata[voiceIdx,"qty"]))
  119.  
  120. sum(voice.call.length)
  121.  
  122. sum(voice.call.length)/as.numeric(difftime(tail(timeStr,1),timeStr[1],units="days"))
  123.  
  124. plot(voice.call.length,type="h")
  125.  
  126. plot(cumsum(voice.call.length),type="l")
  127. abline(lm( (y=cumsum(voice.call.length)) ~ (x=1:length(voice.call.length)) ) , col="Blue")
  128.  
  129. summary(voice.call.length)
  130.  
  131. plot(density(voice.call.length),main="Voice Call Length [min]",xlab="minutes")
  132.  
  133. boxplot(voice.call.length~factor(weekdays(timeStr,abbreviate=TRUE),c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),
  134.         main="Voice call lengths by weekdays",
  135.         ylab="minutes")
  136.  
  137. ### called
  138.  
  139. called <- as.character(njudata[voiceIdx,"phone"])
  140.  
  141. called <- sapply(called, function(x) ClearNumber(x))
  142.  
  143. calledNo <- length(unique(called))
  144.  
  145. required.letters <- ceiling(log(calledNo,26))
  146.  
  147. called.code <- sample(27:26^required.letters, calledNo, replace=FALSE)
  148.  
  149. called.symb <- sapply(called.code, function(x) EncodeNumber(x,required.letters))
  150.  
  151. called.coded <- rep("",length(called))
  152. for (i in 1:calledNo) called.coded[ which(called==unique(called)[i]) ] <- called.symb[i]
  153.  
  154. called.stat <- matrix(0,calledNo,4)
  155. colnames(called.stat) <- c("qty","len","avg","sd")
  156. rownames(called.stat) <- called.symb
  157.  
  158. for (i in 1:length(called)) {
  159.  
  160.   calledId <- which(called.symb==called.coded[i])
  161.  
  162.   called.stat[calledId,"qty"] <- called.stat[calledId,"qty"]+1
  163.   called.stat[calledId,"len"] <- called.stat[calledId,"len"]+voice.call.length[i]
  164.  
  165. }
  166.  
  167. for (i in 1:nrow(called.stat)) {
  168.  
  169.   idx <- which(called.coded==called.symb[i])
  170.  
  171.   called.stat[i,"avg"] <- called.stat[i,"len"]/called.stat[i,"qty"]
  172.   called.stat[i,"sd"]  <- sd(voice.call.length[idx])
  173.  
  174. }
  175.  
  176. par(mfrow=c(2,1))
  177. plot(sort(called.stat[,"qty"]),type="h",
  178.      ylab="Number of calls to a number")
  179. plot(sort(called.stat[,"len"]),type="h",
  180.      ylab="Number of minutes to a number")
  181. par(mfrow=c(1,1))
  182.  
  183. boxplot(voice.call.length~called.coded,
  184.         main="Call characteristic for different recipients",
  185.         xlab="called",ylab="call length in minutes")
  186.  
  187. plot(called.stat[,"qty"],called.stat[,"avg"],pch=16,
  188.      main="Number of calls vs Time of call (to a number)",
  189.      xlab="number of calls",ylab="average call length [min]")
  190. abline(lm(called.stat[,"avg"]~called.stat[,"qty"]),col="Blue")
  191. abline(h=mean(voice.call.length),col="Red",lty="dashed")
  192. lines(called.stat[,"qty"],called.stat[,"sd"],type="h",col="Green")
  193.  
  194. plot(called.stat[,"qty"],called.stat[,"sd"],log="x")
  195.  
  196. nrow(njudata[voiceIdx,])
  197.  
  198. calledNo
  199.  
  200. ### data
  201.  
  202. timeStr <- strptime(as.character(njudata[dataIdx,"time"]),format="%Y.%m.%d %H:%M")
  203.  
  204. data.days <- substr(timeStr,1,10)
  205. data.time <- substr(timeStr,12,16)
  206.  
  207. data.yymm <- substr(data.days,1,7)
  208.  
  209. data.hour <- substr(data.time,1,2)
  210.  
  211. transmitted <- sapply(as.character(njudata[dataIdx,"qty"]), function(x) DataLoad(x))
  212.  
  213. plot(transmitted/1000,type="h",
  214.      main="Data transmission process",
  215.      xlab="time [minutes]",ylab="MB")
  216.  
  217. boxplot((transmitted/1000)~data.hour,
  218.         main="Data transmission per hour",
  219.         log="y",ylab="MB (log)",xlab="hour of a day")
  220.  
  221. plot(cumsum(transmitted/1000),type="l",
  222.      main="Cummulative data transmission",
  223.      xlab="time [minutes]",ylab="MB")
  224. abline(lm((y=cumsum(transmitted/1000))~(x=1:length(transmitted))),col="Blue")
  225.  
  226. sum(transmitted)/1000
  227. difftime(tail(timeStr,1),timeStr[1])
  228.  
  229. transmission.day <- sum(transmitted)/1000/as.numeric(difftime(tail(timeStr,1),timeStr[1],units="days"))
  230.  
  231. transmission.day
  232. transmission.day*30
  233.  
  234. ### voice + data
  235.  
  236. # hourly
  237. comb.stats <- matrix(0,24,2)
  238. colnames(comb.stats) <- c("voice","data")
  239.  
  240. for (i in 1:24) {
  241.  
  242.   idx.voice <- which((as.numeric(voice.hour)+1)==i)
  243.   idx.data  <- which((as.numeric(data.hour)+1)==i)
  244.  
  245.   if (length(idx.voice)>0)  comb.stats[i,"voice"] <- mean(voice.call.length[idx.voice])
  246.  
  247.   if (length(idx.data)>0)   comb.stats[i,"data"]  <- mean(transmitted[idx.data]/1000)
  248.  
  249. }
  250.  
  251. comb.stats[,"voice"] <-  rescale(comb.stats[,"voice"],c(0,1))
  252. comb.stats[,"data"]  <- rescale(comb.stats[,"data"],c(0,1))
  253.  
  254. matplot(comb.stats,type="l",lty="solid",
  255.         main="Voice vs. Data",
  256.         xlab="hour of a day",ylab="average intensity %")
  257. legend("topright",c("voice","data"),text.col=c("Black","Red"),bty="n",bg="n")
  258.  
  259.  
  260. # monthly
  261. all.months <- sort(unique(union(voice.yymm,data.yymm)))
  262.  
  263. month.stats <- matrix(0,length(all.months),2)
  264. colnames(month.stats) <- c("voice","data")
  265. rownames(month.stats) <- all.months
  266.  
  267. timeStr.all <- strptime(as.character(njudata[,"time"]),format="%Y.%m.%d %H:%M")
  268.  
  269. dayB <- head(timeStr.all,1) # begining
  270. dayE <- tail(timeStr.all,1) # end
  271.  
  272. for (this.month in all.months) {
  273.  
  274.   month.idx <- which(all.months==this.month)
  275.  
  276.   daysN <- monthDays(as.Date(paste(this.month,"-01",sep="")))
  277.  
  278.   if (month.idx==1)                   daysN <- daysN-as.numeric(substr(dayB,9,10))+1
  279.    
  280.   if (month.idx==length(all.months))  daysN <- as.numeric(substr(dayE,9,10))
  281.    
  282.   voice.idx <- which(voice.yymm==this.month)
  283.  
  284.   data.idx  <- which(data.yymm==this.month)
  285.  
  286.   month.stats[month.idx,"voice"] <- sum(voice.call.length[voice.idx])/daysN
  287.   month.stats[month.idx,"data"]  <- sum(transmitted[data.idx])/1000/daysN
  288.  
  289. }
  290.  
  291. month.stats
  292.  
  293. # month.stats[,"voice"] <- rescale(month.stats[,"voice"],c(0,1))
  294. # month.stats[,"data"]  <- rescale(month.stats[,"data"],c(0,1))
  295.  
  296. matplot(month.stats,type="l",lty="solid",
  297.         ylab="average daily usage",xaxt="n")
  298. axis(1,1:7,all.months)
  299. legend("topright",c("voice","data"),text.col=c("Black","Red"),bty="n",bg="n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement