Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #This code will read in the metal band affiliation network and test its small world properties
- #Distance projection types are binary, sum, and Newman
- #Both the data it uses and the code below were produced by Benjamin Lind under CC: BY-NC-SA 3.0
- #If you use or develop this code, please provide proper attribution.
- #You may contact me at lind.benjamin//at//gmail//d0t//com
- #You may also find me @benjaminlind on Twitter
- library(RCurl)
- library(compiler)
- library(igraph)
- library(tnet)
- su<-function(x) #Sort the unique entries
- return(sort(unique(x)))
- su<-cmpfun(su)
- #Function to read a data table from online
- read.sstab<-function(theurl, ...){
- #_theurl_ refers to the location of the data
- #_..._ are parameters passed onto read.table
- require(RCurl)
- outtab<-getURL(theurl, ssl.verifypeer=FALSE)
- outtab<-textConnection(outtab)
- outtab<-read.table(outtab, sep="\t", ...)
- return(outtab)
- }
- read.sstab<-cmpfun(read.sstab)
- #Download the most brutal data and name its parts
- metal.bands.df<-read.sstab("http://pastebin.com/raw.php?i=AA1SPz5K", header=TRUE, skip=4, as.is=TRUE, stringsAsFactors=FALSE, strip.white=TRUE, na.strings=c("NA", ""))
- colnames(metal.bands.df)[c(1,2)]<-c("group", "member")
- rm(read.sstab)
- #Some members exit a band and later reunite, a band sometimes disassembles and later reforms
- #For our purposes here, we're going to ignore the time dimension
- #The missing data kind of sucks for it, too.
- #We also switch the columns as members form groups
- non.dupes<-which(duplicated(paste(metal.bands.df$group, metal.bands.df$member, sep="*"))==FALSE)
- metal.bands.df<-metal.bands.df[non.dupes,c("member", "group")]
- metal.bands.df$member<-gsub("&", "&", metal.bands.df$member, fixed=TRUE)
- metal.bands.df$group<-gsub("&", "&", metal.bands.df$group, fixed=TRUE)
- #Retrieve the names for both members and groups.
- #Though a mighty package, tnet doesn't do labels, only integers.
- #_all.metal.names_ will be legend.
- all.metal.names<-unique(c(su(metal.bands.df$member), su(metal.bands.df$group)))
- metal.bands.df$member<-match(metal.bands.df$member, all.metal.names)
- metal.bands.df$group<-match(metal.bands.df$group, all.metal.names)
- #Ignore the abyss that is metal's missing data
- miss.rows<-which((is.na(metal.bands.df$member) | is.na(metal.bands.df$group))==TRUE)
- metal.bands.df<-metal.bands.df[-miss.rows,]
- metal.bands.tn<-list(member.group=as.tnet(metal.bands.df, type="binary two-mode tnet"), group.member=as.tnet(metal.bands.df[,c("group", "member")], type="binary two-mode tnet"))
- rm(non.dupes, miss.rows)
- #Spawn an igraph object
- metal.ig<-graph.data.frame(metal.bands.df[,c("member","group")], directed=FALSE)
- #"Alice Cooper" and other bands using only a member's full name are coded as bands, not musicians.
- #Some metal gods are just so epic they count as multiple people and one at the same time.
- V(metal.ig)$type<-V(metal.ig)$name%in%metal.bands.df$group
- V(metal.ig)$color<-rev(rainbow(2, s=.80, alpha=.80))[1+V(metal.ig)$type]
- V(metal.ig)$shape<-ifelse(V(metal.ig)$type, "square", "circle")
- V(metal.ig)$name<-all.metal.names[as.numeric(V(metal.ig)$name)]
- #Size of the network
- print(c(n.bands=sum(V(metal.ig)$type), n.musicians=sum(V(metal.ig)$type==FALSE)))
- #Who are the closest metal bands and members?
- bands.closeness<-sapply(c("binary", "sum", "Newman"), function(x){ a<-closeness_w(projecting_tm(metal.bands.tn$group.member, x), directed=FALSE); b<-a[,2]; names(b)<-all.metal.names[a[,1]]; return(b)})
- members.closeness<-sapply(c("binary", "sum", "Newman"), function(x){ a<-closeness_w(projecting_tm(metal.bands.tn$member.group, x), directed=FALSE); b<-a[,2]; names(b)<-all.metal.names[a[,1]]; return(b)})
- metal.closeness<-list(group=bands.closeness, member=members.closeness)
- rm(bands.closeness, members.closeness)
- #Are they in the leviathan that is the giant component?
- V(metal.ig)$gc<-FALSE
- V(metal.ig)$gc[which(V(metal.ig)$type)]<-V(metal.ig)$name[which(V(metal.ig)$type)]%in%rownames(metal.closeness$group)
- V(metal.ig)$gc[which(V(metal.ig)$type==FALSE)]<-V(metal.ig)$name[which(V(metal.ig)$type==FALSE)]%in%rownames(metal.closeness$member)
- #Beware, mighty [code] warrior. The dimensions produced by degree_tm differ depending upon the input. For $group.member the rows are about equal to the number of musicians and bands. For $member.group it's about equal to the number of musicians.
- #For this reason, I turn to the trusty igraph package
- metal.degree<-list(group=sort(degree(metal.ig)[which(V(metal.ig)$type)], decreasing=TRUE), member=sort(degree(metal.ig)[which(V(metal.ig)$type==FALSE)], decreasing=TRUE))
- print(sapply(metal.degree, mean))
- print(cbind(names(metal.degree$group)[1:5], metal.degree$group[1:5], names(metal.degree$member)[1:5], metal.degree$member[1:5]))
- #Tread carefully with this data, my dear friend.
- #Ghastly and decidedly un-metal bands do appear.
- #Earth, Wind, and Fire had the eighth most number of members.
- #I could purge them from the data, yet I hesitate to do so.
- #Who is, and is not, "metal" should come from the data.
- #My knowledge is limited and I cannot vouch for the metal cred of each and every one of the thousands of bands in the data.
- #Far more wretched and unmetal bands plague this dataset.
- #Hopefully the most unequivocally metal bands will triumph from their connections within the data.
- sw.meas<-function(a.tm.net, subsamp=1){
- #_a.tm.net_ a two-mode tnet object
- #_subsamp_ the proportion of cases to subsample
- cc<-clustering_tm(a.tm.net, subsample=subsamp) #Clusters
- ret.distpars<-function(a.dist.mat){ #Returns distance parameters
- n<-nrow(a.dist.mat)
- #Overwrite _a.dist.mat_ as a vector of the upper triangle, will be more efficient
- a.dist.mat<-a.dist.mat[which(upper.tri(a.dist.mat))]
- a.max<-max(a.dist.mat)
- a.mean<-mean(a.dist.mat)
- a.ci<-quantile(a.dist.mat, probs=c(.025, .975))
- return(c(vcount=n, mean.geodesic=a.mean, diameter=a.max, lower.ci=a.ci[1], upper.ci=a.ci[2]))
- }
- dist.fun<-function(proj.meth){ #Creates a distance object and returns its parameters.
- return(ret.distpars(distance_tm(a.tm.net, projection.method=proj.meth, gconly=TRUE, subsample=subsamp)))
- }
- geo.paths<-sapply(c("binary", "sum", "Newman"), dist.fun) #Get the distances
- #My sloppy chops on the next few lines shame me.
- geo.paths.vec<-c(geo.paths[,"binary"], geo.paths[,"sum"], geo.paths[,"Newman"])
- cn<-c("binary", "sum", "Newman")
- rn<-c("vcount", "mean.geodesic", "diameter", "lower.ci", "upper.ci")
- ret.vec<-c(cc, geo.paths.vec)
- names(ret.vec)<-c("clustering.coef", paste(cn[1], rn, sep="."), paste(cn[2], rn, sep="."), paste(cn[3], rn, sep="."))
- return(ret.vec)
- }
- sw.meas<-cmpfun(sw.meas)
- #The clustering coefficient here refers to closed four-paths
- #This distinction is important, as two-mode data traditionally do not have closed three-paths.
- #If two-mode networks are projected as one-mode, then their closed three-paths will be biased.
- #Heretics who do so should be crucified.
- #For the distances we're using three means of doing so: binary, sum, and Newman
- #_binary_ is just if overlapping membership exists
- #_sum_ refers to the number of overlapping members
- #_Newman_ divides the number of common members by the number bands each common member was/is in, minus 1.
- #This method assumes that edges are less influential if created by popular vertices
- #See Equation 2 in M.E.J. Newman. 2001. "Scientific collaboration networks. II. Shortest paths, weighted networks, and centrality." Physical Review E, V64, 016132
- #http://pre.aps.org/abstract/PRE/v64/i1/e016132
- #Interpretting the distances for both sum and Newman is relatively straightforward.
- #The weights (sum or Newman) are divided by the mean, then inverted (1/(weight/average weight))
- #These inverted edge weights are added when constructing paths
- #The distances here refer to the number of steps (with average tie weight) the bands are from each other
- #See: http://toreopsahl.com/tnet/weighted-networks/shortest-paths/
- obs.sw<-sw.meas(metal.bands.tn$group.member)
- print(obs.sw["clustering.coef"])
- print(obs.sw[c("binary.mean.geodesic", "sum.mean.geodesic", "Newman.mean.geodesic")])
- ret.tops<-function(x, x.names)
- return(x.names[order(x, decreasing=TRUE)])
- print(cbind(head(apply(metal.closeness$group, 2, ret.tops, x.names=rownames(metal.closeness$group)), 10), head(apply(metal.closeness$member, 2, ret.tops, x.names=rownames(metal.closeness$member)), 10)))
- rm(ret.tops)
- #Tomahawk's position
- which(rownames(metal.closeness$group)=="Tomahawk")
- ecdf(metal.closeness$group[,1])(metal.closeness$group[1309,1])
- ecdf(metal.closeness$group[,2])(metal.closeness$group[1309,2])
- ecdf(metal.closeness$group[,3])(metal.closeness$group[1309,3])
- #Mike Patton's position
- which(rownames(metal.closeness$member)=="Mike Patton")
- ecdf(metal.closeness$member[,1])(metal.closeness$member[3418,1])
- ecdf(metal.closeness$member[,2])(metal.closeness$member[3418,2])
- ecdf(metal.closeness$member[,3])(metal.closeness$member[3418,3])
- #Admire the expansive ego network of Testament
- testament.en<-graph.neighborhood(metal.ig, order=2, nodes=which(V(metal.ig)$name=="Testament"))[[1]]
- testament.lo<-layout.fruchterman.reingold(testament.en, params=list(niter=2000, area=vcount(testament.en)^3))
- V(testament.en)$x<-testament.lo[,1]
- V(testament.en)$y<-testament.lo[,2]
- rm(testament.lo)
- #The high turnover in Black Sabbath's ego network reeks of "creative differences"
- sabbath.en<-graph.neighborhood(metal.ig, order=2, nodes=which(V(metal.ig)$name=="Black Sabbath"))[[1]]
- sabbath.lo<-layout.fruchterman.reingold(sabbath.en, params=list(niter=2000, area=vcount(sabbath.en)^3))
- V(sabbath.en)$x<-sabbath.lo[,1]
- V(sabbath.en)$y<-sabbath.lo[,2]
- rm(sabbath.lo)
- png("Testament.BlackSabbath.png", height=5, width=10, units="in", res=600, bg="white")
- par(mfrow=c(1,2))
- plot(testament.en, vertex.size=5, vertex.label.family="sans", vertex.label.cex=.5, vertex.label.color="black", main="Testament")
- plot(sabbath.en, vertex.size=5, vertex.label.family="sans", vertex.label.cex=.5, vertex.label.color="black", main="Black Sabbath")
- dev.off()
- #T-test of the member degrees
- t.test(degree(testament.en)[which(!V(testament.en)$type)]-1, degree(sabbath.en)[which(!V(sabbath.en)$type)]-1, "greater")
- #Bands tied to Testament
- length(V(testament.en)$name[which(V(testament.en)$type)])-1
- #Bands tied to Black Sabbath
- length(V(sabbath.en)$name[which(V(sabbath.en)$type)])-1
- mean((degree(testament.en)[which(!V(testament.en)$type)]-1)==0)
- mean((degree(sabbath.en)[which(!V(sabbath.en)$type)]-1)==0)
- #A theoretically poor way to measure random average shortest path-length and clustering coefficient
- #Based off of the one-mode projections
- print(c(Path.Length.Random=log(length(which(V(metal.ig)$type&V(metal.ig)$gc)))/log(mean(degree(metal.ig)[which(V(metal.ig)$type&V(metal.ig)$gc)])), Clustering.Coef.Random=mean(degree(metal.ig)[which(V(metal.ig)$type)])/length(which(V(metal.ig)$type))))
- #Though the theory behind it sucks, the results aren't bad!
- #The CUG tests here are based off of reshuffled links
- #This holds the number of members in each band and the number of bands each member joins to be constant
- #Controlling for degree effects, it assumes that any identified person can join any band
- #Admittedly, this assumption isn't completely realistic, but data on controls are limited
- #For random graph tests, see
- #http://www.sciencedirect.com/science/article/pii/S0378873399000118
- #http://toreopsahl.com/tnet/two-mode-networks/random-networks/
- #Brutual computational work ahead.
- #This next command took me four unholy hours to complete.
- exp.sw<-replicate(1000, sw.meas(rg_reshuffling_tm(metal.bands.tn$group.member)))
- exp.sw<-as.data.frame(t(exp.sw))
- save.image("SW.Test.Metal.RData")
- #For the L_actual/L_random as reported by Watts (1999:516)
- obs.sw[c("binary.mean.geodesic","sum.mean.geodesic","Newman.mean.geodesic")]/apply(exp.sw[,c("binary.mean.geodesic","sum.mean.geodesic","Newman.mean.geodesic")],2, mean)
- nrow(metal.closeness$group)
- quantile(c(exp.sw$binary.vcount, exp.sw$sum.vcount, exp.sw$Newman.vcount), probs=c(.025, .975))
- #Leaving out the code that plots the histograms.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement