daily pastebin goal

Metal Small World Test

BenjaminLind Sep 14th, 2013 90 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #This code will read in the metal band affiliation network and test its small world properties
  2.   #Distance projection types are binary, sum, and Newman
  3. #Both the data it uses and the code below were produced by Benjamin Lind under CC: BY-NC-SA 3.0
  4.   #If you use or develop this code, please provide proper attribution.
  5.   #You may contact me at lind.benjamin//at//gmail//d0t//com
  6.   #You may also find me @benjaminlind on Twitter
  8. library(RCurl)
  9. library(compiler)
  10. library(igraph)
  11. library(tnet)
  13. su<-function(x) #Sort the unique entries
  14.   return(sort(unique(x)))
  15. su<-cmpfun(su)
  17. #Function to read a data table from online
  18. read.sstab<-function(theurl, ...){
  19.   #_theurl_ refers to the location of the data
  20.   #_..._ are parameters passed onto read.table
  21.   require(RCurl)
  22.   outtab<-getURL(theurl, ssl.verifypeer=FALSE)
  23.   outtab<-textConnection(outtab)
  24.   outtab<-read.table(outtab, sep="\t", ...)
  25.   return(outtab)
  26.   }
  27. read.sstab<-cmpfun(read.sstab)
  29. #Download the most brutal data and name its parts
  30. metal.bands.df<-read.sstab("http://pastebin.com/raw.php?i=AA1SPz5K", header=TRUE, skip=4, as.is=TRUE, stringsAsFactors=FALSE, strip.white=TRUE, na.strings=c("NA", ""))
  31. colnames(metal.bands.df)[c(1,2)]<-c("group", "member")
  32. rm(read.sstab)
  34. #Some members exit a band and later reunite, a band sometimes disassembles and later reforms
  35. #For our purposes here, we're going to ignore the time dimension
  36. #The missing data kind of sucks for it, too.
  37. #We also switch the columns as members form groups
  38. non.dupes<-which(duplicated(paste(metal.bands.df$group, metal.bands.df$member, sep="*"))==FALSE)
  39. metal.bands.df<-metal.bands.df[non.dupes,c("member", "group")]
  40. metal.bands.df$member<-gsub("&amp;", "&", metal.bands.df$member, fixed=TRUE)
  41. metal.bands.df$group<-gsub("&amp;", "&", metal.bands.df$group, fixed=TRUE)
  43. #Retrieve the names for both members and groups.
  44. #Though a mighty package, tnet doesn't do labels, only integers.
  45. #_all.metal.names_ will be legend.
  46. all.metal.names<-unique(c(su(metal.bands.df$member), su(metal.bands.df$group)))
  47. metal.bands.df$member<-match(metal.bands.df$member, all.metal.names)
  48. metal.bands.df$group<-match(metal.bands.df$group, all.metal.names)
  50. #Ignore the abyss that is metal's missing data
  51. miss.rows<-which((is.na(metal.bands.df$member) | is.na(metal.bands.df$group))==TRUE)
  52. metal.bands.df<-metal.bands.df[-miss.rows,]
  53. metal.bands.tn<-list(member.group=as.tnet(metal.bands.df, type="binary two-mode tnet"), group.member=as.tnet(metal.bands.df[,c("group", "member")], type="binary two-mode tnet"))
  54. rm(non.dupes, miss.rows)
  56. #Spawn an igraph object
  57. metal.ig<-graph.data.frame(metal.bands.df[,c("member","group")], directed=FALSE)
  58. #"Alice Cooper" and other bands using only a member's full name are coded as bands, not musicians.
  59. #Some metal gods are just so epic they count as multiple people and one at the same time.
  60. V(metal.ig)$type<-V(metal.ig)$name%in%metal.bands.df$group
  61. V(metal.ig)$color<-rev(rainbow(2, s=.80, alpha=.80))[1+V(metal.ig)$type]
  62. V(metal.ig)$shape<-ifelse(V(metal.ig)$type, "square", "circle")
  63. V(metal.ig)$name<-all.metal.names[as.numeric(V(metal.ig)$name)]
  65. #Size of the network
  66. print(c(n.bands=sum(V(metal.ig)$type), n.musicians=sum(V(metal.ig)$type==FALSE)))
  68. #Who are the closest metal bands and members?
  69. bands.closeness<-sapply(c("binary", "sum", "Newman"), function(x){ a<-closeness_w(projecting_tm(metal.bands.tn$group.member, x), directed=FALSE); b<-a[,2]; names(b)<-all.metal.names[a[,1]]; return(b)})
  70. members.closeness<-sapply(c("binary", "sum", "Newman"), function(x){ a<-closeness_w(projecting_tm(metal.bands.tn$member.group, x), directed=FALSE); b<-a[,2]; names(b)<-all.metal.names[a[,1]]; return(b)})
  71. metal.closeness<-list(group=bands.closeness, member=members.closeness)
  72. rm(bands.closeness, members.closeness)
  74. #Are they in the leviathan that is the giant component?
  75. V(metal.ig)$gc<-FALSE
  76. V(metal.ig)$gc[which(V(metal.ig)$type)]<-V(metal.ig)$name[which(V(metal.ig)$type)]%in%rownames(metal.closeness$group)
  77. V(metal.ig)$gc[which(V(metal.ig)$type==FALSE)]<-V(metal.ig)$name[which(V(metal.ig)$type==FALSE)]%in%rownames(metal.closeness$member)
  79. #Beware, mighty [code] warrior.  The dimensions produced by degree_tm differ depending upon the input.  For $group.member the rows are about equal to the number of musicians and bands. For $member.group it's about equal to the number of musicians.
  80. #For this reason, I turn to the trusty igraph package
  81. metal.degree<-list(group=sort(degree(metal.ig)[which(V(metal.ig)$type)], decreasing=TRUE), member=sort(degree(metal.ig)[which(V(metal.ig)$type==FALSE)], decreasing=TRUE))
  83. print(sapply(metal.degree, mean))
  85. print(cbind(names(metal.degree$group)[1:5], metal.degree$group[1:5], names(metal.degree$member)[1:5], metal.degree$member[1:5]))
  86. #Tread carefully with this data, my dear friend.
  87. #Ghastly and decidedly un-metal bands do appear.
  88. #Earth, Wind, and Fire had the eighth most number of members.
  89. #I could purge them from the data, yet I hesitate to do so.
  90. #Who is, and is not, "metal" should come from the data.
  91. #My knowledge is limited and I cannot vouch for the metal cred of each and every one of the thousands of bands in the data.
  92. #Far more wretched and unmetal bands plague this dataset.
  93. #Hopefully the most unequivocally metal bands will triumph from their connections within the data.
  95. sw.meas<-function(a.tm.net, subsamp=1){
  96.   #_a.tm.net_ a two-mode tnet object
  97.   #_subsamp_ the proportion of cases to subsample
  98.   cc<-clustering_tm(a.tm.net, subsample=subsamp) #Clusters
  99.   ret.distpars<-function(a.dist.mat){ #Returns distance parameters
  100.     n<-nrow(a.dist.mat)
  101.     #Overwrite _a.dist.mat_ as a vector of the upper triangle, will be more efficient
  102.     a.dist.mat<-a.dist.mat[which(upper.tri(a.dist.mat))]
  103.     a.max<-max(a.dist.mat)
  104.     a.mean<-mean(a.dist.mat)
  105.     a.ci<-quantile(a.dist.mat, probs=c(.025, .975))
  106.     return(c(vcount=n, mean.geodesic=a.mean, diameter=a.max, lower.ci=a.ci[1], upper.ci=a.ci[2]))
  107.     }
  108.   dist.fun<-function(proj.meth){ #Creates a distance object and returns its parameters.
  109.     return(ret.distpars(distance_tm(a.tm.net, projection.method=proj.meth, gconly=TRUE, subsample=subsamp)))
  110.     }
  111.   geo.paths<-sapply(c("binary", "sum", "Newman"), dist.fun) #Get the distances
  112.   #My sloppy chops on the next few lines shame me.
  113.   geo.paths.vec<-c(geo.paths[,"binary"], geo.paths[,"sum"], geo.paths[,"Newman"])
  114.   cn<-c("binary", "sum", "Newman")
  115.   rn<-c("vcount", "mean.geodesic", "diameter", "lower.ci", "upper.ci")
  116.   ret.vec<-c(cc, geo.paths.vec)
  117.   names(ret.vec)<-c("clustering.coef", paste(cn[1], rn, sep="."), paste(cn[2], rn, sep="."), paste(cn[3], rn, sep="."))
  118.   return(ret.vec)
  119.   }
  120. sw.meas<-cmpfun(sw.meas)
  122. #The clustering coefficient here refers to closed four-paths
  123.   #This distinction is important, as two-mode data traditionally do not have closed three-paths.
  124.   #If two-mode networks are projected as one-mode, then their closed three-paths will be biased.
  125.   #Heretics who do so should be crucified.
  126. #For the distances we're using three means of doing so: binary, sum, and Newman
  127.   #_binary_ is just if overlapping membership exists
  128.   #_sum_ refers to the number of overlapping members
  129.   #_Newman_ divides the number of common members by the number bands each common member was/is in, minus 1.
  130.     #This method assumes that edges are less influential if created by popular vertices
  131.     #See Equation 2 in M.E.J. Newman. 2001. "Scientific collaboration networks. II. Shortest paths, weighted networks, and centrality." Physical Review E, V64, 016132
  132.     #http://pre.aps.org/abstract/PRE/v64/i1/e016132
  133.   #Interpretting the distances for both sum and Newman is relatively straightforward.
  134.     #The weights (sum or Newman) are divided by the mean, then inverted (1/(weight/average weight))
  135.     #These inverted edge weights are added when constructing paths
  136.     #The distances here refer to the number of steps (with average tie weight) the bands are from each other
  137.     #See: http://toreopsahl.com/tnet/weighted-networks/shortest-paths/
  139. obs.sw<-sw.meas(metal.bands.tn$group.member)
  140. print(obs.sw["clustering.coef"])
  141. print(obs.sw[c("binary.mean.geodesic", "sum.mean.geodesic", "Newman.mean.geodesic")])
  143. ret.tops<-function(x, x.names)
  144.   return(x.names[order(x, decreasing=TRUE)])
  145. print(cbind(head(apply(metal.closeness$group, 2, ret.tops, x.names=rownames(metal.closeness$group)), 10), head(apply(metal.closeness$member, 2, ret.tops, x.names=rownames(metal.closeness$member)), 10)))
  146. rm(ret.tops)
  148. #Tomahawk's position
  149. which(rownames(metal.closeness$group)=="Tomahawk")
  150. ecdf(metal.closeness$group[,1])(metal.closeness$group[1309,1])
  151. ecdf(metal.closeness$group[,2])(metal.closeness$group[1309,2])
  152. ecdf(metal.closeness$group[,3])(metal.closeness$group[1309,3])
  153. #Mike Patton's position
  154. which(rownames(metal.closeness$member)=="Mike Patton")
  155. ecdf(metal.closeness$member[,1])(metal.closeness$member[3418,1])
  156. ecdf(metal.closeness$member[,2])(metal.closeness$member[3418,2])
  157. ecdf(metal.closeness$member[,3])(metal.closeness$member[3418,3])
  159. #Admire the expansive ego network of Testament
  160. testament.en<-graph.neighborhood(metal.ig, order=2, nodes=which(V(metal.ig)$name=="Testament"))[[1]]
  161. testament.lo<-layout.fruchterman.reingold(testament.en, params=list(niter=2000, area=vcount(testament.en)^3))
  162. V(testament.en)$x<-testament.lo[,1]
  163. V(testament.en)$y<-testament.lo[,2]
  164. rm(testament.lo)
  166. #The high turnover in Black Sabbath's ego network reeks of "creative differences"
  167. sabbath.en<-graph.neighborhood(metal.ig, order=2, nodes=which(V(metal.ig)$name=="Black Sabbath"))[[1]]
  168. sabbath.lo<-layout.fruchterman.reingold(sabbath.en, params=list(niter=2000, area=vcount(sabbath.en)^3))
  169. V(sabbath.en)$x<-sabbath.lo[,1]
  170. V(sabbath.en)$y<-sabbath.lo[,2]
  171. rm(sabbath.lo)
  173. png("Testament.BlackSabbath.png", height=5, width=10, units="in", res=600, bg="white")
  174. par(mfrow=c(1,2))
  175. plot(testament.en, vertex.size=5, vertex.label.family="sans", vertex.label.cex=.5, vertex.label.color="black", main="Testament")
  176. plot(sabbath.en, vertex.size=5, vertex.label.family="sans", vertex.label.cex=.5, vertex.label.color="black", main="Black Sabbath")
  177. dev.off()
  179. #T-test of the member degrees
  180. t.test(degree(testament.en)[which(!V(testament.en)$type)]-1, degree(sabbath.en)[which(!V(sabbath.en)$type)]-1, "greater")
  181. #Bands tied to Testament
  182. length(V(testament.en)$name[which(V(testament.en)$type)])-1
  183. #Bands tied to Black Sabbath
  184. length(V(sabbath.en)$name[which(V(sabbath.en)$type)])-1
  186. mean((degree(testament.en)[which(!V(testament.en)$type)]-1)==0)
  187. mean((degree(sabbath.en)[which(!V(sabbath.en)$type)]-1)==0)
  189. #A theoretically poor way to measure random average shortest path-length and clustering coefficient
  190. #Based off of the one-mode projections
  191. print(c(Path.Length.Random=log(length(which(V(metal.ig)$type&V(metal.ig)$gc)))/log(mean(degree(metal.ig)[which(V(metal.ig)$type&V(metal.ig)$gc)])), Clustering.Coef.Random=mean(degree(metal.ig)[which(V(metal.ig)$type)])/length(which(V(metal.ig)$type))))
  192. #Though the theory behind it sucks, the results aren't bad!
  194. #The CUG tests here are based off of reshuffled links
  195.   #This holds the number of members in each band and the number of bands each member joins to be constant
  196.   #Controlling for degree effects, it assumes that any identified person can join any band
  197.   #Admittedly, this assumption isn't completely realistic, but data on controls are limited
  198.   #For random graph tests, see
  199.     #http://www.sciencedirect.com/science/article/pii/S0378873399000118
  200.     #http://toreopsahl.com/tnet/two-mode-networks/random-networks/  
  202. #Brutual computational work ahead.
  203. #This next command took me four unholy hours to complete.
  204. exp.sw<-replicate(1000, sw.meas(rg_reshuffling_tm(metal.bands.tn$group.member)))
  205. exp.sw<-as.data.frame(t(exp.sw))
  206. save.image("SW.Test.Metal.RData")
  208. #For the L_actual/L_random as reported by Watts (1999:516)
  209. obs.sw[c("binary.mean.geodesic","sum.mean.geodesic","Newman.mean.geodesic")]/apply(exp.sw[,c("binary.mean.geodesic","sum.mean.geodesic","Newman.mean.geodesic")],2, mean)
  210. nrow(metal.closeness$group)
  211. quantile(c(exp.sw$binary.vcount, exp.sw$sum.vcount, exp.sw$Newman.vcount), probs=c(.025, .975))
  213. #Leaving out the code that plots the histograms.
RAW Paste Data