daily pastebin goal

Metal Small World Test

BenjaminLind Sep 14th, 2013 93 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #This code will read in the metal band affiliation network and test its small world properties
  2.   #Distance projection types are binary, sum, and Newman
  3. #Both the data it uses and the code below were produced by Benjamin Lind under CC: BY-NC-SA 3.0
  4.   #If you use or develop this code, please provide proper attribution.
  5.   #You may contact me at lind.benjamin//at//gmail//d0t//com
  6.   #You may also find me @benjaminlind on Twitter
  8. library(RCurl)
  9. library(compiler)
  10. library(igraph)
  11. library(tnet)
  13. su<-function(x) #Sort the unique entries
  14.   return(sort(unique(x)))
  15. su<-cmpfun(su)
  17. #Function to read a data table from online
  18. read.sstab<-function(theurl, ...){
  19.   #_theurl_ refers to the location of the data
  20.   #_..._ are parameters passed onto read.table
  21.   require(RCurl)
  22.   outtab<-getURL(theurl, ssl.verifypeer=FALSE)
  23.   outtab<-textConnection(outtab)
  24.   outtab<-read.table(outtab, sep="\t", ...)
  25.   return(outtab)
  26.   }
  27. read.sstab<-cmpfun(read.sstab)
  29. #Download the most brutal data and name its parts
  30. metal.bands.df<-read.sstab("http://pastebin.com/raw.php?i=AA1SPz5K", header=TRUE, skip=4, as.is=TRUE, stringsAsFactors=FALSE, strip.white=TRUE, na.strings=c("NA", ""))
  31. colnames(metal.bands.df)[c(1,2)]<-c("group", "member")
  32. rm(read.sstab)
  34. #Some members exit a band and later reunite, a band sometimes disassembles and later reforms
  35. #For our purposes here, we're going to ignore the time dimension
  36. #The missing data kind of sucks for it, too.
  37. #We also switch the columns as members form groups
  38. non.dupes<-which(duplicated(paste(metal.bands.df$group, metal.bands.df$member, sep="*"))==FALSE)
  39. metal.bands.df<-metal.bands.df[non.dupes,c("member", "group")]
  40. metal.bands.df$member<-gsub("&amp;", "&", metal.bands.df$member, fixed=TRUE)
  41. metal.bands.df$group<-gsub("&amp;", "&", metal.bands.df$group, fixed=TRUE)
  43. #Retrieve the names for both members and groups.
  44. #Though a mighty package, tnet doesn't do labels, only integers.
  45. #_all.metal.names_ will be legend.
  46. all.metal.names<-unique(c(su(metal.bands.df$member), su(metal.bands.df$group)))
  47. metal.bands.df$member<-match(metal.bands.df$member, all.metal.names)
  48. metal.bands.df$group<-match(metal.bands.df$group, all.metal.names)
  50. #Ignore the abyss that is metal's missing data
  51. miss.rows<-which((is.na(metal.bands.df$member) | is.na(metal.bands.df$group))==TRUE)
  52. metal.bands.df<-metal.bands.df[-miss.rows,]
  53. metal.bands.tn<-list(member.group=as.tnet(metal.bands.df, type="binary two-mode tnet"), group.member=as.tnet(metal.bands.df[,c("group", "member")], type="binary two-mode tnet"))
  54. rm(non.dupes, miss.rows)
  56. #Spawn an igraph object
  57. metal.ig<-graph.data.frame(metal.bands.df[,c("member","group")], directed=FALSE)
  58. #"Alice Cooper" and other bands using only a member's full name are coded as bands, not musicians.
  59. #Some metal gods are just so epic they count as multiple people and one at the same time.
  60. V(metal.ig)$type<-V(metal.ig)$name%in%metal.bands.df$group
  61. V(metal.ig)$color<-rev(rainbow(2, s=.80, alpha=.80))[1+V(metal.ig)$type]
  62. V(metal.ig)$shape<-ifelse(V(metal.ig)$type, "square", "circle")
  63. V(metal.ig)$name<-all.metal.names[as.numeric(V(metal.ig)$name)]
  65. #Size of the network
  66. print(c(n.bands=sum(V(metal.ig)$type), n.musicians=sum(V(metal.ig)$type==FALSE)))
  68. #Who are the closest metal bands and members?
  69. bands.closeness<-sapply(c("binary", "sum", "Newman"), function(x){ a<-closeness_w(projecting_tm(metal.bands.tn$group.member, x), directed=FALSE); b<-a[,2]; names(b)<-all.metal.names[a[,1]]; return(b)})
  70. members.closeness<-sapply(c("binary", "sum", "Newman"), function(x){ a<-closeness_w(projecting_tm(metal.bands.tn$member.group, x), directed=FALSE); b<-a[,2]; names(b)<-all.metal.names[a[,1]]; return(b)})
  71. metal.closeness<-list(group=bands.closeness, member=members.closeness)
  72. rm(bands.closeness, members.closeness)
  74. #Are they in the leviathan that is the giant component?
  75. V(metal.ig)$gc<-FALSE
  76. V(metal.ig)$gc[which(V(metal.ig)$type)]<-V(metal.ig)$name[which(V(metal.ig)$type)]%in%rownames(metal.closeness$group)
  77. V(metal.ig)$gc[which(V(metal.ig)$type==FALSE)]<-V(metal.ig)$name[which(V(metal.ig)$type==FALSE)]%in%rownames(metal.closeness$member)
  79. #Beware, mighty [code] warrior.  The dimensions produced by degree_tm differ depending upon the input.  For $group.member the rows are about equal to the number of musicians and bands. For $member.group it's about equal to the number of musicians.
  80. #For this reason, I turn to the trusty igraph package
  81. metal.degree<-list(group=sort(degree(metal.ig)[which(V(metal.ig)$type)], decreasing=TRUE), member=sort(degree(metal.ig)[which(V(metal.ig)$type==FALSE)], decreasing=TRUE))
  83. print(sapply(metal.degree, mean))
  85. print(cbind(names(metal.degree$group)[1:5], metal.degree$group[1:5], names(metal.degree$member)[1:5], metal.degree$member[1:5]))
  86. #Tread carefully with this data, my dear friend.
  87. #Ghastly and decidedly un-metal bands do appear.
  88. #Earth, Wind, and Fire had the eighth most number of members.
  89. #I could purge them from the data, yet I hesitate to do so.
  90. #Who is, and is not, "metal" should come from the data.
  91. #My knowledge is limited and I cannot vouch for the metal cred of each and every one of the thousands of bands in the data.
  92. #Far more wretched and unmetal bands plague this dataset.
  93. #Hopefully the most unequivocally metal bands will triumph from their connections within the data.
  95. sw.meas<-function(a.tm.net, subsamp=1){
  96.   #_a.tm.net_ a two-mode tnet object
  97.   #_subsamp_ the proportion of cases to subsample
  98.   cc<-clustering_tm(a.tm.net, subsample=subsamp) #Clusters
  99.   ret.distpars<-function(a.dist.mat){ #Returns distance parameters
  100.     n<-nrow(a.dist.mat)
  101.     #Overwrite _a.dist.mat_ as a vector of the upper triangle, will be more efficient
  102.     a.dist.mat<-a.dist.mat[which(upper.tri(a.dist.mat))]
  103.     a.max<-max(a.dist.mat)
  104.     a.mean<-mean(a.dist.mat)
  105.     a.ci<-quantile(a.dist.mat, probs=c(.025, .975))
  106.     return(c(vcount=n, mean.geodesic=a.mean, diameter=a.max, lower.ci=a.ci[1], upper.ci=a.ci[2]))
  107.     }
  108.   dist.fun<-function(proj.meth){ #Creates a distance object and returns its parameters.
  109.     return(ret.distpars(distance_tm(a.tm.net, projection.method=proj.meth, gconly=TRUE, subsample=subsamp)))
  110.     }
  111.   geo.paths<-sapply(c("binary", "sum", "Newman"), dist.fun) #Get the distances
  112.   #My sloppy chops on the next few lines shame me.
  113.   geo.paths.vec<-c(geo.paths[,"binary"], geo.paths[,"sum"], geo.paths[,"Newman"])
  114.   cn<-c("binary", "sum", "Newman")
  115.   rn<-c("vcount", "mean.geodesic", "diameter", "lower.ci", "upper.ci")
  116.   ret.vec<-c(cc, geo.paths.vec)
  117.   names(ret.vec)<-c("clustering.coef", paste(cn[1], rn, sep="."), paste(cn[2], rn, sep="."), paste(cn[3], rn, sep="."))
  118.   return(ret.vec)
  119.   }
  120. sw.meas<-cmpfun(sw.meas)
  122. #The clustering coefficient here refers to closed four-paths
  123.   #This distinction is important, as two-mode data traditionally do not have closed three-paths.
  124.   #If two-mode networks are projected as one-mode, then their closed three-paths will be biased.
  125.   #Heretics who do so should be crucified.
  126. #For the distances we're using three means of doing so: binary, sum, and Newman
  127.   #_binary_ is just if overlapping membership exists
  128.   #_sum_ refers to the number of overlapping members
  129.   #_Newman_ divides the number of common members by the number bands each common member was/is in, minus 1.
  130.     #This method assumes that edges are less influential if created by popular vertices
  131.     #See Equation 2 in M.E.J. Newman. 2001. "Scientific collaboration networks. II. Shortest paths, weighted networks, and centrality." Physical Review E, V64, 016132
  132.     #http://pre.aps.org/abstract/PRE/v64/i1/e016132
  133.   #Interpretting the distances for both sum and Newman is relatively straightforward.
  134.     #The weights (sum or Newman) are divided by the mean, then inverted (1/(weight/average weight))
  135.     #These inverted edge weights are added when constructing paths
  136.     #The distances here refer to the number of steps (with average tie weight) the bands are from each other
  137.     #See: http://toreopsahl.com/tnet/weighted-networks/shortest-paths/
  139. obs.sw<-sw.meas(metal.bands.tn$group.member)
  140. print(obs.sw["clustering.coef"])
  141. print(obs.sw[c("binary.mean.geodesic", "sum.mean.geodesic", "Newman.mean.geodesic")])
  143. ret.tops<-function(x, x.names)
  144.   return(x.names[order(x, decreasing=TRUE)])
  145. print(cbind(head(apply(metal.closeness$group, 2, ret.tops, x.names=rownames(metal.closeness$group)), 10), head(apply(metal.closeness$member, 2, ret.tops, x.names=rownames(metal.closeness$member)), 10)))
  146. rm(ret.tops)
  148. #Tomahawk's position
  149. which(rownames(metal.closeness$group)=="Tomahawk")
  150. ecdf(metal.closeness$group[,1])(metal.closeness$group[1309,1])
  151. ecdf(metal.closeness$group[,2])(metal.closeness$group[1309,2])
  152. ecdf(metal.closeness$group[,3])(metal.closeness$group[1309,3])
  153. #Mike Patton's position
  154. which(rownames(metal.closeness$member)=="Mike Patton")
  155. ecdf(metal.closeness$member[,1])(metal.closeness$member[3418,1])
  156. ecdf(metal.closeness$member[,2])(metal.closeness$member[3418,2])
  157. ecdf(metal.closeness$member[,3])(metal.closeness$member[3418,3])
  159. #Admire the expansive ego network of Testament
  160. testament.en<-graph.neighborhood(metal.ig, order=2, nodes=which(V(metal.ig)$name=="Testament"))[[1]]
  161. testament.lo<-layout.fruchterman.reingold(testament.en, params=list(niter=2000, area=vcount(testament.en)^3))
  162. V(testament.en)$x<-testament.lo[,1]
  163. V(testament.en)$y<-testament.lo[,2]
  164. rm(testament.lo)
  166. #The high turnover in Black Sabbath's ego network reeks of "creative differences"
  167. sabbath.en<-graph.neighborhood(metal.ig, order=2, nodes=which(V(metal.ig)$name=="Black Sabbath"))[[1]]
  168. sabbath.lo<-layout.fruchterman.reingold(sabbath.en, params=list(niter=2000, area=vcount(sabbath.en)^3))
  169. V(sabbath.en)$x<-sabbath.lo[,1]
  170. V(sabbath.en)$y<-sabbath.lo[,2]
  171. rm(sabbath.lo)
  173. png("Testament.BlackSabbath.png", height=5, width=10, units="in", res=600, bg="white")
  174. par(mfrow=c(1,2))
  175. plot(testament.en, vertex.size=5, vertex.label.family="sans", vertex.label.cex=.5, vertex.label.color="black", main="Testament")
  176. plot(sabbath.en, vertex.size=5, vertex.label.family="sans", vertex.label.cex=.5, vertex.label.color="black", main="Black Sabbath")
  177. dev.off()
  179. #T-test of the member degrees
  180. t.test(degree(testament.en)[which(!V(testament.en)$type)]-1, degree(sabbath.en)[which(!V(sabbath.en)$type)]-1, "greater")
  181. #Bands tied to Testament
  182. length(V(testament.en)$name[which(V(testament.en)$type)])-1
  183. #Bands tied to Black Sabbath
  184. length(V(sabbath.en)$name[which(V(sabbath.en)$type)])-1
  186. mean((degree(testament.en)[which(!V(testament.en)$type)]-1)==0)
  187. mean((degree(sabbath.en)[which(!V(sabbath.en)$type)]-1)==0)
  189. #A theoretically poor way to measure random average shortest path-length and clustering coefficient
  190. #Based off of the one-mode projections
  191. print(c(Path.Length.Random=log(length(which(V(metal.ig)$type&V(metal.ig)$gc)))/log(mean(degree(metal.ig)[which(V(metal.ig)$type&V(metal.ig)$gc)])), Clustering.Coef.Random=mean(degree(metal.ig)[which(V(metal.ig)$type)])/length(which(V(metal.ig)$type))))
  192. #Though the theory behind it sucks, the results aren't bad!
  194. #The CUG tests here are based off of reshuffled links
  195.   #This holds the number of members in each band and the number of bands each member joins to be constant
  196.   #Controlling for degree effects, it assumes that any identified person can join any band
  197.   #Admittedly, this assumption isn't completely realistic, but data on controls are limited
  198.   #For random graph tests, see
  199.     #http://www.sciencedirect.com/science/article/pii/S0378873399000118
  200.     #http://toreopsahl.com/tnet/two-mode-networks/random-networks/  
  202. #Brutual computational work ahead.
  203. #This next command took me four unholy hours to complete.
  204. exp.sw<-replicate(1000, sw.meas(rg_reshuffling_tm(metal.bands.tn$group.member)))
  205. exp.sw<-as.data.frame(t(exp.sw))
  206. save.image("SW.Test.Metal.RData")
  208. #For the L_actual/L_random as reported by Watts (1999:516)
  209. obs.sw[c("binary.mean.geodesic","sum.mean.geodesic","Newman.mean.geodesic")]/apply(exp.sw[,c("binary.mean.geodesic","sum.mean.geodesic","Newman.mean.geodesic")],2, mean)
  210. nrow(metal.closeness$group)
  211. quantile(c(exp.sw$binary.vcount, exp.sw$sum.vcount, exp.sw$Newman.vcount), probs=c(.025, .975))
  213. #Leaving out the code that plots the histograms.
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand